Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ set(ICEBERG_SOURCES
statistics_file.cc
table_metadata.cc
transform.cc
transform_function.cc
type.cc)

set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
Expand Down
4 changes: 2 additions & 2 deletions src/iceberg/json_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ Result<std::unique_ptr<SortField>> SortFieldFromJson(const nlohmann::json& json)
ICEBERG_ASSIGN_OR_RAISE(auto source_id, GetJsonValue<int32_t>(json, kSourceId));
ICEBERG_ASSIGN_OR_RAISE(
auto transform,
GetJsonValue<std::string>(json, kTransform).and_then(TransformFunctionFromString));
GetJsonValue<std::string>(json, kTransform).and_then(TransformFromString));
ICEBERG_ASSIGN_OR_RAISE(
auto direction,
GetJsonValue<std::string>(json, kDirection).and_then(SortDirectionFromString));
Expand Down Expand Up @@ -401,7 +401,7 @@ Result<std::unique_ptr<PartitionField>> PartitionFieldFromJson(
ICEBERG_ASSIGN_OR_RAISE(auto field_id, GetJsonValue<int32_t>(json, kFieldId));
ICEBERG_ASSIGN_OR_RAISE(
auto transform,
GetJsonValue<std::string>(json, kTransform).and_then(TransformFunctionFromString));
GetJsonValue<std::string>(json, kTransform).and_then(TransformFromString));
ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue<std::string>(json, kName));
return std::make_unique<PartitionField>(source_id, field_id, name,
std::move(transform));
Expand Down
6 changes: 2 additions & 4 deletions src/iceberg/partition_field.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
namespace iceberg {

PartitionField::PartitionField(int32_t source_id, int32_t field_id, std::string name,
std::shared_ptr<TransformFunction> transform)
std::shared_ptr<Transform> transform)
: source_id_(source_id),
field_id_(field_id),
name_(std::move(name)),
Expand All @@ -40,9 +40,7 @@ int32_t PartitionField::field_id() const { return field_id_; }

std::string_view PartitionField::name() const { return name_; }

std::shared_ptr<TransformFunction> const& PartitionField::transform() const {
return transform_;
}
std::shared_ptr<Transform> const& PartitionField::transform() const { return transform_; }

std::string PartitionField::ToString() const {
return std::format("{} ({} {}({}))", name_, field_id_, *transform_, source_id_);
Expand Down
6 changes: 3 additions & 3 deletions src/iceberg/partition_field.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable {
/// \param[in] name The partition field name.
/// \param[in] transform The transform function.
PartitionField(int32_t source_id, int32_t field_id, std::string name,
std::shared_ptr<TransformFunction> transform);
std::shared_ptr<Transform> transform);

/// \brief Get the source field ID.
int32_t source_id() const;
Expand All @@ -55,7 +55,7 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable {
std::string_view name() const;

/// \brief Get the transform type.
std::shared_ptr<TransformFunction> const& transform() const;
std::shared_ptr<Transform> const& transform() const;

std::string ToString() const override;

Expand All @@ -74,7 +74,7 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable {
int32_t source_id_;
int32_t field_id_;
std::string name_;
std::shared_ptr<TransformFunction> transform_;
std::shared_ptr<Transform> transform_;
};

} // namespace iceberg
6 changes: 2 additions & 4 deletions src/iceberg/sort_field.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

namespace iceberg {

SortField::SortField(int32_t source_id, std::shared_ptr<TransformFunction> transform,
SortField::SortField(int32_t source_id, std::shared_ptr<Transform> transform,
SortDirection direction, NullOrder null_order)
: source_id_(source_id),
transform_(std::move(transform)),
Expand All @@ -36,9 +36,7 @@ SortField::SortField(int32_t source_id, std::shared_ptr<TransformFunction> trans

int32_t SortField::source_id() const { return source_id_; }

std::shared_ptr<TransformFunction> const& SortField::transform() const {
return transform_;
}
std::shared_ptr<Transform> const& SortField::transform() const { return transform_; }

SortDirection SortField::direction() const { return direction_; }

Expand Down
6 changes: 3 additions & 3 deletions src/iceberg/sort_field.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,14 @@ class ICEBERG_EXPORT SortField : public util::Formattable {
/// \param[in] transform The transform function.
/// \param[in] direction The sort direction.
/// \param[in] null_order The null order.
SortField(int32_t source_id, std::shared_ptr<TransformFunction> transform,
SortField(int32_t source_id, std::shared_ptr<Transform> transform,
SortDirection direction, NullOrder null_order);

/// \brief Get the source field ID.
int32_t source_id() const;

/// \brief Get the transform type.
const std::shared_ptr<TransformFunction>& transform() const;
const std::shared_ptr<Transform>& transform() const;

/// \brief Get the sort direction.
SortDirection direction() const;
Expand All @@ -127,7 +127,7 @@ class ICEBERG_EXPORT SortField : public util::Formattable {
[[nodiscard]] bool Equals(const SortField& other) const;

int32_t source_id_;
std::shared_ptr<TransformFunction> transform_;
std::shared_ptr<Transform> transform_;
SortDirection direction_;
NullOrder null_order_;
};
Expand Down
206 changes: 173 additions & 33 deletions src/iceberg/transform.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,66 +20,206 @@
#include "iceberg/transform.h"

#include <charconv>
#include <format>
#include <regex>
#include <system_error>

namespace iceberg {
#include "iceberg/transform_function.h"
#include "iceberg/type.h"

namespace iceberg {
namespace {
/// \brief Get the relative transform name
constexpr std::string_view ToString(TransformType type) {
constexpr std::string_view kUnknownName = "unknown";
constexpr std::string_view kIdentityName = "identity";
constexpr std::string_view kBucketName = "bucket";
constexpr std::string_view kTruncateName = "truncate";
constexpr std::string_view kYearName = "year";
constexpr std::string_view kMonthName = "month";
constexpr std::string_view kDayName = "day";
constexpr std::string_view kHourName = "hour";
constexpr std::string_view kVoidName = "void";
} // namespace

constexpr std::string_view TransformTypeToString(TransformType type) {
switch (type) {
case TransformType::kUnknown:
return "unknown";
return kUnknownName;
case TransformType::kIdentity:
return "identity";
return kIdentityName;
case TransformType::kBucket:
return "bucket";
return kBucketName;
case TransformType::kTruncate:
return "truncate";
return kTruncateName;
case TransformType::kYear:
return "year";
return kYearName;
case TransformType::kMonth:
return "month";
return kMonthName;
case TransformType::kDay:
return "day";
return kDayName;
case TransformType::kHour:
return "hour";
return kHourName;
case TransformType::kVoid:
return "void";
default:
return "invalid";
return kVoidName;
}
}
} // namespace

TransformFunction::TransformFunction(TransformType type) : transform_type_(type) {}
std::shared_ptr<Transform> Transform::Identity() {
static auto instance =
std::shared_ptr<Transform>(new Transform(TransformType::kIdentity));
return instance;
}

std::shared_ptr<Transform> Transform::Year() {
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kYear));
return instance;
}

TransformType TransformFunction::transform_type() const { return transform_type_; }
std::shared_ptr<Transform> Transform::Month() {
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kMonth));
return instance;
}

std::shared_ptr<Transform> Transform::Day() {
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kDay));
return instance;
}

std::shared_ptr<Transform> Transform::Hour() {
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kHour));
return instance;
}

std::shared_ptr<Transform> Transform::Void() {
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kVoid));
return instance;
}

std::shared_ptr<Transform> Transform::Bucket(int32_t num_buckets) {
return std::shared_ptr<Transform>(new Transform(TransformType::kBucket, num_buckets));
}

std::shared_ptr<Transform> Transform::Truncate(int32_t width) {
return std::shared_ptr<Transform>(new Transform(TransformType::kTruncate, width));
}

Transform::Transform(TransformType transform_type) : transform_type_(transform_type) {}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Throw for parameterized transform_type?


Transform::Transform(TransformType transform_type, int32_t param)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Throw for non-parameterized transform_type?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we want to avoid invalid inputs, perhaps we should define separate static functions to create each transform type?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Currently I have only added Transform::Identity; I will add the others as well.

: transform_type_(transform_type), param_(param) {}

/// \brief Get the kind of this transform.
TransformType Transform::transform_type() const {
  return transform_type_;
}

Result<std::unique_ptr<TransformFunction>> Transform::Bind(
const std::shared_ptr<Type>& source_type) const {
auto type_str = TransformTypeToString(transform_type_);

switch (transform_type_) {
case TransformType::kIdentity:
return std::make_unique<IdentityTransform>(source_type);

case TransformType::kBucket: {
if (auto param = std::get_if<int32_t>(&param_)) {
return std::make_unique<BucketTransform>(source_type, *param);
}
return unexpected<Error>({
.kind = ErrorKind::kInvalidArgument,
.message = std::format(
"Bucket requires int32 param, none found in transform '{}'", type_str),
});
}

std::string TransformFunction::ToString() const {
return std::format("{}", iceberg::ToString(transform_type_));
case TransformType::kTruncate: {
if (auto param = std::get_if<int32_t>(&param_)) {
return std::make_unique<TruncateTransform>(source_type, *param);
}
return unexpected<Error>({
.kind = ErrorKind::kInvalidArgument,
.message = std::format(
"Truncate requires int32 param, none found in transform '{}'", type_str),
});
}

case TransformType::kYear:
return std::make_unique<YearTransform>(source_type);
case TransformType::kMonth:
return std::make_unique<MonthTransform>(source_type);
case TransformType::kDay:
return std::make_unique<DayTransform>(source_type);
case TransformType::kHour:
return std::make_unique<HourTransform>(source_type);
case TransformType::kVoid:
return std::make_unique<VoidTransform>(source_type);

default:
return unexpected<Error>({
.kind = ErrorKind::kNotSupported,
.message = std::format("Unsupported transform type: '{}'", type_str),
});
}
}

bool TransformFunction::Equals(const TransformFunction& other) const {
return transform_type_ == other.transform_type_;
return transform_type_ == other.transform_type_ && *source_type_ == *other.source_type_;
}

/// \brief Render the transform as text, e.g. "identity" or "bucket[16]".
///
/// Parameterless transforms (and kUnknown) render as their bare name. Bucket and
/// truncate render as "name[param]". If a bucket/truncate transform holds no int32
/// parameter, the bare name is returned instead of throwing std::bad_variant_access.
///
/// \return The textual form, or "invalid" when transform_type_ is not one of the
/// TransformType enumerators.
std::string Transform::ToString() const {
  switch (transform_type_) {
    case TransformType::kIdentity:
    case TransformType::kYear:
    case TransformType::kMonth:
    case TransformType::kDay:
    case TransformType::kHour:
    case TransformType::kVoid:
    case TransformType::kUnknown:
      return std::format("{}", TransformTypeToString(transform_type_));
    case TransformType::kBucket:
    case TransformType::kTruncate: {
      // Mirror Bind(): the single-argument constructor allows a parameterized type
      // without a stored parameter, so probe with get_if rather than a throwing get.
      if (const auto* param = std::get_if<int32_t>(&param_)) {
        return std::format("{}[{}]", TransformTypeToString(transform_type_), *param);
      }
      return std::format("{}", TransformTypeToString(transform_type_));
    }
  }
  // No `default` label above, so compilers can still warn when an enumerator is added.
  // This line is reached only for a value outside the enum; returning here avoids
  // flowing off the end of a non-void function, which is undefined behaviour.
  return "invalid";
}

IdentityTransformFunction::IdentityTransformFunction()
: TransformFunction(TransformType::kIdentity) {}
TransformFunction::TransformFunction(TransformType transform_type,
std::shared_ptr<Type> source_type)
: transform_type_(transform_type), source_type_(std::move(source_type)) {}

expected<ArrowArray, Error> IdentityTransformFunction::Transform(
const ArrowArray& input) {
return unexpected<Error>({.kind = ErrorKind::kNotSupported,
.message = "IdentityTransformFunction::Transform"});
/// \brief Get the kind of transform this function implements.
TransformType TransformFunction::transform_type() const {
  return transform_type_;
}

std::shared_ptr<Type> const& TransformFunction::source_type() const {
return source_type_;
}

/// \brief Compare this transform with another one.
///
/// \param other The transform to compare against.
/// \return true when both the transform type and the stored parameter are equal.
bool Transform::Equals(const Transform& other) const {
  if (transform_type_ != other.transform_type_) {
    return false;
  }
  return param_ == other.param_;
}

expected<std::unique_ptr<TransformFunction>, Error> TransformFunctionFromString(
std::string_view str) {
if (str == "identity") {
return std::make_unique<IdentityTransformFunction>();
/// \brief Parse a transform from its textual form.
///
/// Accepts the bare names "identity", "year", "month", "day", "hour" and "void", and the
/// parameterized forms "bucket[N]" and "truncate[W]" where N / W consist of decimal
/// digits only.
///
/// \param transform_str The text to parse, e.g. "identity" or "bucket[16]".
/// \return The parsed transform, or an ErrorKind::kInvalidArgument error when the text
/// matches no known form or the bracketed number does not fit in int32_t.
Result<std::shared_ptr<Transform>> TransformFromString(std::string_view transform_str) {
  if (transform_str == kIdentityName) return Transform::Identity();
  if (transform_str == kYearName) return Transform::Year();
  if (transform_str == kMonthName) return Transform::Month();
  if (transform_str == kDayName) return Transform::Day();
  if (transform_str == kHourName) return Transform::Hour();
  if (transform_str == kVoidName) return Transform::Void();

  // Match bucket[16] or truncate[4]
  static const std::regex param_regex(
      std::format(R"(({}|{})\[(\d+)\])", kBucketName, kTruncateName));
  std::string str(transform_str);
  std::smatch match;
  if (std::regex_match(str, match, param_regex)) {
    const std::string type_str = match[1];
    const std::string digits = match[2];

    // The regex accepts arbitrarily many digits, so the number may not fit in int32_t.
    // std::from_chars reports that through an error code; std::stoi would throw
    // std::out_of_range and bypass the Result-based error contract of this function.
    int32_t param = 0;
    const char* const first = digits.data();
    const char* const last = first + digits.size();
    const auto [end, ec] = std::from_chars(first, last, param);

    if (ec == std::errc{} && end == last) {
      if (type_str == kBucketName) {
        return Transform::Bucket(param);
      }
      if (type_str == kTruncateName) {
        return Transform::Truncate(param);
      }
    }
  }

  return unexpected<Error>({
      .kind = ErrorKind::kInvalidArgument,
      .message = std::format("Invalid Transform string: {}", transform_str),
  });
}

} // namespace iceberg
Loading
Loading