Skip to content

Commit 7ad56db

Browse files
committed
feat: transform function
1 parent 4a5fe91 commit 7ad56db

17 files changed

+527
-142
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ set(ICEBERG_SOURCES
3131
statistics_file.cc
3232
table_metadata.cc
3333
transform.cc
34+
transform_function.cc
3435
type.cc)
3536

3637
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)

src/iceberg/json_internal.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ nlohmann::json ToJson(const SortOrder& sort_order) {
8585
expected<std::unique_ptr<SortField>, Error> SortFieldFromJson(
8686
const nlohmann::json& json) {
8787
TRY_ASSIGN(transform_str, GetJsonValue<std::string>(json, kTransform));
88-
TRY_ASSIGN(transform, TransformFunctionFromString(transform_str));
88+
TRY_ASSIGN(transform, TransformFromString(transform_str));
8989
TRY_ASSIGN(source_id, GetJsonValue<int32_t>(json, kSourceId));
9090
TRY_ASSIGN(direction_str, GetJsonValue<std::string>(json, kDirection));
9191
TRY_ASSIGN(direction, SortDirectionFromString(direction_str));

src/iceberg/partition_field.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
namespace iceberg {
2929

3030
PartitionField::PartitionField(int32_t source_id, int32_t field_id, std::string name,
31-
std::shared_ptr<TransformFunction> transform)
31+
std::shared_ptr<Transform> transform)
3232
: source_id_(source_id),
3333
field_id_(field_id),
3434
name_(std::move(name)),
@@ -40,9 +40,7 @@ int32_t PartitionField::field_id() const { return field_id_; }
4040

4141
std::string_view PartitionField::name() const { return name_; }
4242

43-
std::shared_ptr<TransformFunction> const& PartitionField::transform() const {
44-
return transform_;
45-
}
43+
std::shared_ptr<Transform> const& PartitionField::transform() const { return transform_; }
4644

4745
std::string PartitionField::ToString() const {
4846
return std::format("{} ({} {}({}))", name_, field_id_, *transform_, source_id_);

src/iceberg/partition_field.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable {
4343
/// \param[in] name The partition field name.
4444
/// \param[in] transform The transform applied to the source field.
4545
PartitionField(int32_t source_id, int32_t field_id, std::string name,
46-
std::shared_ptr<TransformFunction> transform);
46+
std::shared_ptr<Transform> transform);
4747

4848
/// \brief Get the source field ID.
4949
int32_t source_id() const;
@@ -55,7 +55,7 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable {
5555
std::string_view name() const;
5656

5757
/// \brief Get the transform.
58-
std::shared_ptr<TransformFunction> const& transform() const;
58+
std::shared_ptr<Transform> const& transform() const;
5959

6060
std::string ToString() const override;
6161

@@ -74,7 +74,7 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable {
7474
int32_t source_id_;
7575
int32_t field_id_;
7676
std::string name_;
77-
std::shared_ptr<TransformFunction> transform_;
77+
std::shared_ptr<Transform> transform_;
7878
};
7979

8080
} // namespace iceberg

src/iceberg/sort_field.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
namespace iceberg {
2929

30-
SortField::SortField(int32_t source_id, std::shared_ptr<TransformFunction> transform,
30+
SortField::SortField(int32_t source_id, std::shared_ptr<Transform> transform,
3131
SortDirection direction, NullOrder null_order)
3232
: source_id_(source_id),
3333
transform_(std::move(transform)),
@@ -36,9 +36,7 @@ SortField::SortField(int32_t source_id, std::shared_ptr<TransformFunction> trans
3636

3737
int32_t SortField::source_id() const { return source_id_; }
3838

39-
std::shared_ptr<TransformFunction> const& SortField::transform() const {
40-
return transform_;
41-
}
39+
std::shared_ptr<Transform> const& SortField::transform() const { return transform_; }
4240

4341
SortDirection SortField::direction() const { return direction_; }
4442

src/iceberg/sort_field.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,14 +98,14 @@ class ICEBERG_EXPORT SortField : public util::Formattable {
9898
/// \param[in] transform The transform applied to the source field.
9999
/// \param[in] direction The sort direction.
100100
/// \param[in] null_order The null order.
101-
SortField(int32_t source_id, std::shared_ptr<TransformFunction> transform,
101+
SortField(int32_t source_id, std::shared_ptr<Transform> transform,
102102
SortDirection direction, NullOrder null_order);
103103

104104
/// \brief Get the source field ID.
105105
int32_t source_id() const;
106106

107107
/// \brief Get the transform.
108-
const std::shared_ptr<TransformFunction>& transform() const;
108+
const std::shared_ptr<Transform>& transform() const;
109109

110110
/// \brief Get the sort direction.
111111
SortDirection direction() const;
@@ -128,7 +128,7 @@ class ICEBERG_EXPORT SortField : public util::Formattable {
128128
[[nodiscard]] bool Equals(const SortField& other) const;
129129

130130
int32_t source_id_;
131-
std::shared_ptr<TransformFunction> transform_;
131+
std::shared_ptr<Transform> transform_;
132132
SortDirection direction_;
133133
NullOrder null_order_;
134134
};

src/iceberg/transform.cc

Lines changed: 86 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -21,65 +21,113 @@
2121

2222
#include <format>
2323

24+
#include "iceberg/transform_function.h"
25+
#include "iceberg/type.h"
26+
2427
namespace iceberg {
2528

26-
namespace {
27-
/// \brief Get the relative transform name
28-
constexpr std::string_view ToString(TransformType type) {
29-
switch (type) {
30-
case TransformType::kUnknown:
31-
return "unknown";
29+
std::shared_ptr<Transform> Transform::Identity() {
30+
static auto instance = std::make_shared<Transform>(TransformType::kIdentity);
31+
return instance;
32+
}
33+
34+
Transform::Transform(TransformType transform_type) : transform_type_(transform_type) {}
35+
36+
Transform::Transform(TransformType transform_type, int32_t param)
37+
: transform_type_(transform_type), param_(param) {}
38+
39+
TransformType Transform::transform_type() const { return transform_type_; }
40+
41+
expected<std::unique_ptr<TransformFunction>, Error> Transform::Bind(
42+
const std::shared_ptr<Type>& source_type) const {
43+
auto type_str = TransformTypeToString(transform_type_);
44+
45+
switch (transform_type_) {
3246
case TransformType::kIdentity:
33-
return "identity";
34-
case TransformType::kBucket:
35-
return "bucket";
36-
case TransformType::kTruncate:
37-
return "truncate";
47+
return std::make_unique<IdentityTransform>(source_type);
48+
49+
case TransformType::kBucket: {
50+
if (auto param = std::get_if<int32_t>(&param_)) {
51+
return std::make_unique<BucketTransform>(source_type, *param);
52+
}
53+
return unexpected<Error>({
54+
.kind = ErrorKind::kInvalidArgument,
55+
.message = std::format(
56+
"Bucket requires int32 param, none found in transform '{}'", type_str),
57+
});
58+
}
59+
60+
case TransformType::kTruncate: {
61+
if (auto param = std::get_if<int32_t>(&param_)) {
62+
return std::make_unique<TruncateTransform>(source_type, *param);
63+
}
64+
return unexpected<Error>({
65+
.kind = ErrorKind::kInvalidArgument,
66+
.message = std::format(
67+
"Truncate requires int32 param, none found in transform '{}'", type_str),
68+
});
69+
}
70+
3871
case TransformType::kYear:
39-
return "year";
72+
return std::make_unique<YearTransform>(source_type);
4073
case TransformType::kMonth:
41-
return "month";
74+
return std::make_unique<MonthTransform>(source_type);
4275
case TransformType::kDay:
43-
return "day";
76+
return std::make_unique<DayTransform>(source_type);
4477
case TransformType::kHour:
45-
return "hour";
78+
return std::make_unique<HourTransform>(source_type);
4679
case TransformType::kVoid:
47-
return "void";
80+
return std::make_unique<VoidTransform>(source_type);
81+
4882
default:
49-
return "invalid";
83+
return unexpected<Error>({
84+
.kind = ErrorKind::kNotSupported,
85+
.message = std::format("Unsupported transform type: '{}'", type_str),
86+
});
5087
}
5188
}
52-
} // namespace
53-
54-
TransformFunction::TransformFunction(TransformType type) : transform_type_(type) {}
55-
56-
TransformType TransformFunction::transform_type() const { return transform_type_; }
5789

58-
std::string TransformFunction::ToString() const {
59-
return std::format("{}", iceberg::ToString(transform_type_));
90+
bool TransformFunction::Equals(const TransformFunction& other) const {
91+
return transform_type_ == other.transform_type_ && *source_type_ == *other.source_type_;
6092
}
6193

62-
bool TransformFunction::Equals(const TransformFunction& other) const {
63-
return transform_type_ == other.transform_type_;
94+
std::string Transform::ToString() const {
95+
switch (transform_type_) {
96+
case TransformType::kIdentity:
97+
case TransformType::kYear:
98+
case TransformType::kMonth:
99+
case TransformType::kDay:
100+
case TransformType::kHour:
101+
case TransformType::kVoid:
102+
case TransformType::kUnknown:
103+
return std::format("{}", TransformTypeToString(transform_type_));
104+
case TransformType::kBucket:
105+
case TransformType::kTruncate:
106+
return std::format("{}[{}]", TransformTypeToString(transform_type_),
107+
std::get<int32_t>(param_));
108+
}
64109
}
65110

66-
IdentityTransformFunction::IdentityTransformFunction()
67-
: TransformFunction(TransformType::kIdentity) {}
111+
TransformFunction::TransformFunction(TransformType transform_type,
112+
std::shared_ptr<Type> source_type)
113+
: transform_type_(transform_type), source_type_(std::move(source_type)) {}
114+
115+
TransformType TransformFunction::transform_type() const { return transform_type_; }
116+
117+
std::shared_ptr<Type> const& TransformFunction::source_type() const {
118+
return source_type_;
119+
}
68120

69-
expected<ArrowArray, Error> IdentityTransformFunction::Transform(
70-
const ArrowArray& input) {
71-
return unexpected<Error>({.kind = ErrorKind::kNotSupported,
72-
.message = "IdentityTransformFunction::Transform"});
121+
bool Transform::Equals(const Transform& other) const {
122+
return transform_type_ == other.transform_type_ && param_ == other.param_;
73123
}
74124

75-
expected<std::unique_ptr<TransformFunction>, Error> TransformFunctionFromString(
76-
std::string_view str) {
125+
expected<std::unique_ptr<Transform>, Error> TransformFromString(std::string_view str) {
77126
if (str == "identity") {
78-
return std::make_unique<IdentityTransformFunction>();
127+
return std::make_unique<Transform>(TransformType::kIdentity);
79128
}
80-
return unexpected<Error>(
81-
{.kind = ErrorKind::kInvalidArgument,
82-
.message = "Invalid TransformFunction string: " + std::string(str)});
129+
return unexpected<Error>({.kind = ErrorKind::kInvalidArgument,
130+
.message = std::format("Invalid Transform string: {}", str)});
83131
}
84132

85133
} // namespace iceberg

src/iceberg/transform.h

Lines changed: 90 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -56,16 +56,98 @@ enum class TransformType {
5656
kVoid,
5757
};
5858

59+
/// \brief Get the string name corresponding to a transform type
60+
constexpr std::string_view TransformTypeToString(TransformType type) {
61+
switch (type) {
62+
case TransformType::kUnknown:
63+
return "unknown";
64+
case TransformType::kIdentity:
65+
return "identity";
66+
case TransformType::kBucket:
67+
return "bucket";
68+
case TransformType::kTruncate:
69+
return "truncate";
70+
case TransformType::kYear:
71+
return "year";
72+
case TransformType::kMonth:
73+
return "month";
74+
case TransformType::kDay:
75+
return "day";
76+
case TransformType::kHour:
77+
return "hour";
78+
case TransformType::kVoid:
79+
return "void";
80+
}
81+
}
82+
83+
/// \brief Represents a transform used in partitioning or sorting in Iceberg.
84+
///
85+
/// This class supports binding to a source type and instantiating the corresponding
86+
/// TransformFunction, as well as serialization-friendly introspection.
87+
class ICEBERG_EXPORT Transform : public util::Formattable {
88+
public:
89+
/// \brief Returns a shared singleton instance of the Identity transform.
90+
///
91+
/// This transform leaves values unchanged and is commonly used for direct partitioning.
92+
/// \return A shared pointer to the Identity transform.
93+
static std::shared_ptr<Transform> Identity();
94+
95+
/// \brief Constructs a Transform of the specified type (for non-parametric types).
96+
/// \param transform_type The transform type (e.g., identity, year, day).
97+
explicit Transform(TransformType transform_type);
98+
99+
/// \brief Constructs a parameterized Transform (e.g., bucket(16), truncate(4)).
100+
/// \param transform_type The transform type.
101+
/// \param param The integer parameter associated with the transform.
102+
Transform(TransformType transform_type, int32_t param);
103+
104+
/// \brief Returns the transform type.
105+
TransformType transform_type() const;
106+
107+
/// \brief Binds this transform to a source type, returning a typed TransformFunction.
108+
///
109+
/// This creates a concrete transform implementation based on the transform type and
110+
/// parameter.
111+
/// \param source_type The source column type to bind to.
112+
/// \return A TransformFunction instance wrapped in `expected`, or an error on failure.
113+
expected<std::unique_ptr<TransformFunction>, Error> Bind(
114+
const std::shared_ptr<Type>& source_type) const;
115+
116+
/// \brief Returns a string representation of this transform (e.g., "bucket[16]").
117+
std::string ToString() const override;
118+
119+
/// \brief Equality comparison.
120+
friend bool operator==(const Transform& lhs, const Transform& rhs) {
121+
return lhs.Equals(rhs);
122+
}
123+
124+
/// \brief Inequality comparison.
125+
friend bool operator!=(const Transform& lhs, const Transform& rhs) {
126+
return !(lhs == rhs);
127+
}
128+
129+
private:
130+
/// \brief Checks equality with another Transform instance.
131+
[[nodiscard]] virtual bool Equals(const Transform& other) const;
132+
133+
TransformType transform_type_;
134+
/// Optional parameter (e.g., num_buckets, width)
135+
std::variant<std::monostate, int32_t> param_;
136+
};
137+
59138
/// \brief A transform function used for partitioning.
60-
class ICEBERG_EXPORT TransformFunction : public util::Formattable {
139+
class ICEBERG_EXPORT TransformFunction {
61140
public:
62-
explicit TransformFunction(TransformType type);
141+
virtual ~TransformFunction() = default;
142+
TransformFunction(TransformType transform_type, std::shared_ptr<Type> source_type);
63143
/// \brief Transform an input array to a new array
64144
virtual expected<ArrowArray, Error> Transform(const ArrowArray& data) = 0;
65145
/// \brief Get the transform type
66-
virtual TransformType transform_type() const;
67-
68-
std::string ToString() const override;
146+
TransformType transform_type() const;
147+
/// \brief Get the source type of the transform function
148+
const std::shared_ptr<Type>& source_type() const;
149+
/// \brief Get the result type of the transform function
150+
virtual expected<std::shared_ptr<Type>, Error> ResultType() const = 0;
69151

70152
friend bool operator==(const TransformFunction& lhs, const TransformFunction& rhs) {
71153
return lhs.Equals(rhs);
@@ -80,16 +162,10 @@ class ICEBERG_EXPORT TransformFunction : public util::Formattable {
80162
[[nodiscard]] virtual bool Equals(const TransformFunction& other) const;
81163

82164
TransformType transform_type_;
165+
std::shared_ptr<Type> source_type_;
83166
};
84167

85-
ICEBERG_EXPORT expected<std::unique_ptr<TransformFunction>, Error>
86-
TransformFunctionFromString(std::string_view str);
87-
88-
class ICEBERG_EXPORT IdentityTransformFunction : public TransformFunction {
89-
public:
90-
IdentityTransformFunction();
91-
/// \brief Transform will take an input array and transform it into a new array.
92-
expected<ArrowArray, Error> Transform(const ArrowArray& input) override;
93-
};
168+
ICEBERG_EXPORT expected<std::unique_ptr<Transform>, Error> TransformFromString(
169+
std::string_view str);
94170

95171
} // namespace iceberg

0 commit comments

Comments
 (0)