Skip to content

Commit e6b2fc9

Browse files
committed
feat: transform function
1 parent 22adac2 commit e6b2fc9

17 files changed

+528
-142
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ set(ICEBERG_SOURCES
3131
statistics_file.cc
3232
table_metadata.cc
3333
transform.cc
34+
transform_function.cc
3435
type.cc)
3536

3637
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)

src/iceberg/json_internal.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ expected<std::unique_ptr<SortField>, Error> SortFieldFromJson(
114114
ICEBERG_ASSIGN_OR_RAISE(auto source_id, GetJsonValue<int32_t>(json, kSourceId));
115115
ICEBERG_ASSIGN_OR_RAISE(
116116
auto transform,
117-
GetJsonValue<std::string>(json, kTransform).and_then(TransformFunctionFromString));
117+
GetJsonValue<std::string>(json, kTransform).and_then(TransformFromString));
118118
ICEBERG_ASSIGN_OR_RAISE(
119119
auto direction,
120120
GetJsonValue<std::string>(json, kDirection).and_then(SortDirectionFromString));

src/iceberg/partition_field.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
namespace iceberg {
2929

3030
PartitionField::PartitionField(int32_t source_id, int32_t field_id, std::string name,
31-
std::shared_ptr<TransformFunction> transform)
31+
std::shared_ptr<Transform> transform)
3232
: source_id_(source_id),
3333
field_id_(field_id),
3434
name_(std::move(name)),
@@ -40,9 +40,7 @@ int32_t PartitionField::field_id() const { return field_id_; }
4040

4141
std::string_view PartitionField::name() const { return name_; }
4242

43-
std::shared_ptr<TransformFunction> const& PartitionField::transform() const {
44-
return transform_;
45-
}
43+
/// \brief Accessor for the transform held by this partition field.
/// \return A const reference to the stored shared pointer; no copy is made.
const std::shared_ptr<Transform>& PartitionField::transform() const {
  return transform_;
}
4644

4745
std::string PartitionField::ToString() const {
4846
return std::format("{} ({} {}({}))", name_, field_id_, *transform_, source_id_);

src/iceberg/partition_field.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable {
4343
/// \param[in] name The partition field name.
4444
/// \param[in] transform The transform applied to the source field.
4545
PartitionField(int32_t source_id, int32_t field_id, std::string name,
46-
std::shared_ptr<TransformFunction> transform);
46+
std::shared_ptr<Transform> transform);
4747

4848
/// \brief Get the source field ID.
4949
int32_t source_id() const;
@@ -55,7 +55,7 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable {
5555
std::string_view name() const;
5656

5757
/// \brief Get the transform.
58-
std::shared_ptr<TransformFunction> const& transform() const;
58+
std::shared_ptr<Transform> const& transform() const;
5959

6060
std::string ToString() const override;
6161

@@ -74,7 +74,7 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable {
7474
int32_t source_id_;
7575
int32_t field_id_;
7676
std::string name_;
77-
std::shared_ptr<TransformFunction> transform_;
77+
std::shared_ptr<Transform> transform_;
7878
};
7979

8080
} // namespace iceberg

src/iceberg/sort_field.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
namespace iceberg {
2929

30-
SortField::SortField(int32_t source_id, std::shared_ptr<TransformFunction> transform,
30+
SortField::SortField(int32_t source_id, std::shared_ptr<Transform> transform,
3131
SortDirection direction, NullOrder null_order)
3232
: source_id_(source_id),
3333
transform_(std::move(transform)),
@@ -36,9 +36,7 @@ SortField::SortField(int32_t source_id, std::shared_ptr<TransformFunction> trans
3636

3737
int32_t SortField::source_id() const { return source_id_; }
3838

39-
std::shared_ptr<TransformFunction> const& SortField::transform() const {
40-
return transform_;
41-
}
39+
/// \brief Accessor for the transform held by this sort field.
/// \return A const reference to the stored shared pointer; no copy is made.
const std::shared_ptr<Transform>& SortField::transform() const {
  return transform_;
}
4240

4341
SortDirection SortField::direction() const { return direction_; }
4442

src/iceberg/sort_field.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,14 +98,14 @@ class ICEBERG_EXPORT SortField : public util::Formattable {
9898
/// \param[in] transform The transform applied to the source field.
9999
/// \param[in] direction The sort direction.
100100
/// \param[in] null_order The null order.
101-
SortField(int32_t source_id, std::shared_ptr<TransformFunction> transform,
101+
SortField(int32_t source_id, std::shared_ptr<Transform> transform,
102102
SortDirection direction, NullOrder null_order);
103103

104104
/// \brief Get the source field ID.
105105
int32_t source_id() const;
106106

107107
/// \brief Get the transform.
108-
const std::shared_ptr<TransformFunction>& transform() const;
108+
const std::shared_ptr<Transform>& transform() const;
109109

110110
/// \brief Get the sort direction.
111111
SortDirection direction() const;
@@ -128,7 +128,7 @@ class ICEBERG_EXPORT SortField : public util::Formattable {
128128
[[nodiscard]] bool Equals(const SortField& other) const;
129129

130130
int32_t source_id_;
131-
std::shared_ptr<TransformFunction> transform_;
131+
std::shared_ptr<Transform> transform_;
132132
SortDirection direction_;
133133
NullOrder null_order_;
134134
};

src/iceberg/transform.cc

Lines changed: 86 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -21,65 +21,113 @@
2121

2222
#include <format>
2323

24+
#include "iceberg/transform_function.h"
25+
#include "iceberg/type.h"
26+
2427
namespace iceberg {
2528

26-
namespace {
27-
/// \brief Get the relative transform name
28-
constexpr std::string_view ToString(TransformType type) {
29-
switch (type) {
30-
case TransformType::kUnknown:
31-
return "unknown";
29+
std::shared_ptr<Transform> Transform::Identity() {
30+
static auto instance = std::make_shared<Transform>(TransformType::kIdentity);
31+
return instance;
32+
}
33+
34+
Transform::Transform(TransformType transform_type) : transform_type_(transform_type) {}
35+
36+
Transform::Transform(TransformType transform_type, int32_t param)
37+
: transform_type_(transform_type), param_(param) {}
38+
39+
/// \brief Return the type this transform was constructed with.
TransformType Transform::transform_type() const {
  return transform_type_;
}
40+
41+
expected<std::unique_ptr<TransformFunction>, Error> Transform::Bind(
42+
const std::shared_ptr<Type>& source_type) const {
43+
auto type_str = TransformTypeToString(transform_type_);
44+
45+
switch (transform_type_) {
3246
case TransformType::kIdentity:
33-
return "identity";
34-
case TransformType::kBucket:
35-
return "bucket";
36-
case TransformType::kTruncate:
37-
return "truncate";
47+
return std::make_unique<IdentityTransform>(source_type);
48+
49+
case TransformType::kBucket: {
50+
if (auto param = std::get_if<int32_t>(&param_)) {
51+
return std::make_unique<BucketTransform>(source_type, *param);
52+
}
53+
return unexpected<Error>({
54+
.kind = ErrorKind::kInvalidArgument,
55+
.message = std::format(
56+
"Bucket requires int32 param, none found in transform '{}'", type_str),
57+
});
58+
}
59+
60+
case TransformType::kTruncate: {
61+
if (auto param = std::get_if<int32_t>(&param_)) {
62+
return std::make_unique<TruncateTransform>(source_type, *param);
63+
}
64+
return unexpected<Error>({
65+
.kind = ErrorKind::kInvalidArgument,
66+
.message = std::format(
67+
"Truncate requires int32 param, none found in transform '{}'", type_str),
68+
});
69+
}
70+
3871
case TransformType::kYear:
39-
return "year";
72+
return std::make_unique<YearTransform>(source_type);
4073
case TransformType::kMonth:
41-
return "month";
74+
return std::make_unique<MonthTransform>(source_type);
4275
case TransformType::kDay:
43-
return "day";
76+
return std::make_unique<DayTransform>(source_type);
4477
case TransformType::kHour:
45-
return "hour";
78+
return std::make_unique<HourTransform>(source_type);
4679
case TransformType::kVoid:
47-
return "void";
80+
return std::make_unique<VoidTransform>(source_type);
81+
4882
default:
49-
return "invalid";
83+
return unexpected<Error>({
84+
.kind = ErrorKind::kNotSupported,
85+
.message = std::format("Unsupported transform type: '{}'", type_str),
86+
});
5087
}
5188
}
52-
} // namespace
53-
54-
TransformFunction::TransformFunction(TransformType type) : transform_type_(type) {}
55-
56-
TransformType TransformFunction::transform_type() const { return transform_type_; }
5789

58-
std::string TransformFunction::ToString() const {
59-
return std::format("{}", iceberg::ToString(transform_type_));
90+
bool TransformFunction::Equals(const TransformFunction& other) const {
91+
return transform_type_ == other.transform_type_ && *source_type_ == *other.source_type_;
6092
}
6193

62-
bool TransformFunction::Equals(const TransformFunction& other) const {
63-
return transform_type_ == other.transform_type_;
94+
/// \brief Render this transform as text.
///
/// Bucket and truncate transforms that carry an integer parameter render as
/// "name[param]" (e.g. "bucket[16]"). Every other transform renders as the bare name
/// returned by TransformTypeToString. A bucket/truncate transform built through the
/// one-argument constructor stores no parameter and also renders as the bare name.
///
/// \return The textual form of this transform.
std::string Transform::ToString() const {
  const auto name = TransformTypeToString(transform_type_);
  const bool parametric = transform_type_ == TransformType::kBucket ||
                          transform_type_ == TransformType::kTruncate;
  if (parametric) {
    // Probe with get_if rather than std::get: param_ may hold std::monostate, and
    // std::get would then throw std::bad_variant_access out of a formatting call.
    if (const auto* param = std::get_if<int32_t>(&param_)) {
      return std::format("{}[{}]", name, *param);
    }
  }
  // Single exit for all remaining cases, so no path can flow off the end of the
  // function the way a switch without a trailing return could.
  return std::format("{}", name);
}
65110

66-
IdentityTransformFunction::IdentityTransformFunction()
67-
: TransformFunction(TransformType::kIdentity) {}
111+
TransformFunction::TransformFunction(TransformType transform_type,
112+
std::shared_ptr<Type> source_type)
113+
: transform_type_(transform_type), source_type_(std::move(source_type)) {}
114+
115+
/// \brief Return the transform type this function was constructed with.
TransformType TransformFunction::transform_type() const {
  return transform_type_;
}
116+
117+
std::shared_ptr<Type> const& TransformFunction::source_type() const {
118+
return source_type_;
119+
}
68120

69-
expected<ArrowArray, Error> IdentityTransformFunction::Transform(
70-
const ArrowArray& input) {
71-
return unexpected<Error>({.kind = ErrorKind::kNotSupported,
72-
.message = "IdentityTransformFunction::Transform"});
121+
bool Transform::Equals(const Transform& other) const {
122+
return transform_type_ == other.transform_type_ && param_ == other.param_;
73123
}
74124

75-
expected<std::unique_ptr<TransformFunction>, Error> TransformFunctionFromString(
76-
std::string_view str) {
125+
expected<std::unique_ptr<Transform>, Error> TransformFromString(std::string_view str) {
77126
if (str == "identity") {
78-
return std::make_unique<IdentityTransformFunction>();
127+
return std::make_unique<Transform>(TransformType::kIdentity);
79128
}
80-
return unexpected<Error>(
81-
{.kind = ErrorKind::kInvalidArgument,
82-
.message = "Invalid TransformFunction string: " + std::string(str)});
129+
return unexpected<Error>({.kind = ErrorKind::kInvalidArgument,
130+
.message = std::format("Invalid Transform string: {}", str)});
83131
}
84132

85133
} // namespace iceberg

src/iceberg/transform.h

Lines changed: 91 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
#include <cstdint>
2525
#include <memory>
26+
#include <variant>
2627

2728
#include "iceberg/arrow_c_data.h"
2829
#include "iceberg/error.h"
@@ -56,16 +57,98 @@ enum class TransformType {
5657
kVoid,
5758
};
5859

60+
/// \brief Get the relative transform name
61+
constexpr std::string_view TransformTypeToString(TransformType type) {
62+
switch (type) {
63+
case TransformType::kUnknown:
64+
return "unknown";
65+
case TransformType::kIdentity:
66+
return "identity";
67+
case TransformType::kBucket:
68+
return "bucket";
69+
case TransformType::kTruncate:
70+
return "truncate";
71+
case TransformType::kYear:
72+
return "year";
73+
case TransformType::kMonth:
74+
return "month";
75+
case TransformType::kDay:
76+
return "day";
77+
case TransformType::kHour:
78+
return "hour";
79+
case TransformType::kVoid:
80+
return "void";
81+
}
82+
}
83+
84+
/// \brief Represents a transform used in partitioning or sorting in Iceberg.
85+
///
86+
/// This class supports binding to a source type and instantiating the corresponding
87+
/// TransformFunction, as well as serialization-friendly introspection.
88+
class ICEBERG_EXPORT Transform : public util::Formattable {
89+
public:
90+
/// \brief Returns a shared singleton instance of the Identity transform.
91+
///
92+
/// This transform leaves values unchanged and is commonly used for direct partitioning.
93+
/// \return A shared pointer to the Identity transform.
94+
static std::shared_ptr<Transform> Identity();
95+
96+
/// \brief Constructs a Transform of the specified type (for non-parametric types).
97+
/// \param transform_type The transform type (e.g., identity, year, day).
98+
explicit Transform(TransformType transform_type);
99+
100+
/// \brief Constructs a parameterized Transform (e.g., bucket(16), truncate(4)).
101+
/// \param transform_type The transform type.
102+
/// \param param The integer parameter associated with the transform.
103+
Transform(TransformType transform_type, int32_t param);
104+
105+
/// \brief Returns the transform type.
106+
TransformType transform_type() const;
107+
108+
/// \brief Binds this transform to a source type, returning a typed TransformFunction.
109+
///
110+
/// This creates a concrete transform implementation based on the transform type and
111+
/// parameter.
112+
/// \param source_type The source column type to bind to.
113+
/// \return A TransformFunction instance wrapped in `expected`, or an error on failure.
114+
expected<std::unique_ptr<TransformFunction>, Error> Bind(
115+
const std::shared_ptr<Type>& source_type) const;
116+
117+
/// \brief Returns a string representation of this transform (e.g., "bucket[16]").
118+
std::string ToString() const override;
119+
120+
/// \brief Equality comparison.
121+
friend bool operator==(const Transform& lhs, const Transform& rhs) {
122+
return lhs.Equals(rhs);
123+
}
124+
125+
/// \brief Inequality comparison.
126+
friend bool operator!=(const Transform& lhs, const Transform& rhs) {
127+
return !(lhs == rhs);
128+
}
129+
130+
private:
131+
/// \brief Checks equality with another Transform instance.
132+
[[nodiscard]] virtual bool Equals(const Transform& other) const;
133+
134+
TransformType transform_type_;
135+
/// Optional parameter (e.g., num_buckets, width)
136+
std::variant<std::monostate, int32_t> param_;
137+
};
138+
59139
/// \brief A transform function used for partitioning.
60-
class ICEBERG_EXPORT TransformFunction : public util::Formattable {
140+
class ICEBERG_EXPORT TransformFunction {
61141
public:
62-
explicit TransformFunction(TransformType type);
142+
virtual ~TransformFunction() = default;
143+
TransformFunction(TransformType transform_type, std::shared_ptr<Type> source_type);
63144
/// \brief Transform an input array to a new array
64145
virtual expected<ArrowArray, Error> Transform(const ArrowArray& data) = 0;
65146
/// \brief Get the transform type
66-
virtual TransformType transform_type() const;
67-
68-
std::string ToString() const override;
147+
TransformType transform_type() const;
148+
/// \brief Get the source type of transform function
149+
const std::shared_ptr<Type>& source_type() const;
150+
/// \brief Get the result type of transform function
151+
virtual expected<std::shared_ptr<Type>, Error> ResultType() const = 0;
69152

70153
friend bool operator==(const TransformFunction& lhs, const TransformFunction& rhs) {
71154
return lhs.Equals(rhs);
@@ -80,16 +163,10 @@ class ICEBERG_EXPORT TransformFunction : public util::Formattable {
80163
[[nodiscard]] virtual bool Equals(const TransformFunction& other) const;
81164

82165
TransformType transform_type_;
166+
std::shared_ptr<Type> source_type_;
83167
};
84168

85-
ICEBERG_EXPORT expected<std::unique_ptr<TransformFunction>, Error>
86-
TransformFunctionFromString(std::string_view str);
87-
88-
class ICEBERG_EXPORT IdentityTransformFunction : public TransformFunction {
89-
public:
90-
IdentityTransformFunction();
91-
/// \brief Transform will take an input array and transform it into a new array.
92-
expected<ArrowArray, Error> Transform(const ArrowArray& input) override;
93-
};
169+
ICEBERG_EXPORT expected<std::unique_ptr<Transform>, Error> TransformFromString(
170+
std::string_view str);
94171

95172
} // namespace iceberg

0 commit comments

Comments
 (0)