diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index e37095043..a13f095aa 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -59,11 +59,14 @@ set(ICEBERG_SOURCES transform.cc transform_function.cc type.cc + util/bucket_util.cc util/conversions.cc util/decimal.cc util/gzip_internal.cc util/murmurhash3_internal.cc + util/temporal_util.cc util/timepoint.cc + util/truncate_util.cc util/uuid.cc v1_metadata.cc v2_metadata.cc diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index 18a46c636..aea719c84 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -24,9 +24,9 @@ #include #include -#include "iceberg/type_fwd.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/conversions.h" +#include "iceberg/util/macros.h" namespace iceberg { @@ -188,11 +188,14 @@ Result LiteralCaster::CastFromString( const auto& str_val = std::get(literal.value_); switch (target_type->type_id()) { + case TypeId::kUuid: { + ICEBERG_ASSIGN_OR_RAISE(auto uuid, Uuid::FromString(str_val)); + return Literal::UUID(uuid); + } case TypeId::kDate: case TypeId::kTime: case TypeId::kTimestamp: case TypeId::kTimestampTz: - case TypeId::kUuid: return NotImplemented("Cast from String to {} is not implemented yet", target_type->ToString()); default: @@ -296,6 +299,10 @@ Literal Literal::Fixed(std::vector value) { return {Value{std::move(value)}, fixed(size)}; } +Literal Literal::Decimal(int128_t value, int32_t precision, int32_t scale) { + return {Value{::iceberg::Decimal(value)}, decimal(precision, scale)}; +} + Result Literal::Deserialize(std::span data, std::shared_ptr type) { return Conversions::FromBytes(std::move(type), data); @@ -385,6 +392,15 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const { return CompareFloat(this_val, other_val); } + case TypeId::kDecimal: { + auto& this_val = std::get<::iceberg::Decimal>(value_); + auto& other_val = 
std::get<::iceberg::Decimal>(other.value_); + const auto& this_decimal_type = internal::checked_cast(*type_); + const auto& other_decimal_type = internal::checked_cast(*other.type_); + return ::iceberg::Decimal::Compare(this_val, other_val, this_decimal_type.scale(), + other_decimal_type.scale()); + } + case TypeId::kString: { auto& this_val = std::get(value_); auto& other_val = std::get(other.value_); @@ -440,6 +456,12 @@ std::string Literal::ToString() const { case TypeId::kDouble: { return std::to_string(std::get(value_)); } + case TypeId::kDecimal: { + const auto& decimal_type = internal::checked_cast(*type_); + const auto& decimal = std::get<::iceberg::Decimal>(value_); + return decimal.ToString(decimal_type.scale()) + .value_or("invalid literal of type decimal"); + } case TypeId::kString: { return "\"" + std::get(value_) + "\""; } diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h index 70ff2d806..13ffafe68 100644 --- a/src/iceberg/expression/literal.h +++ b/src/iceberg/expression/literal.h @@ -27,7 +27,9 @@ #include "iceberg/result.h" #include "iceberg/type.h" +#include "iceberg/util/decimal.h" #include "iceberg/util/formattable.h" +#include "iceberg/util/int128.h" #include "iceberg/util/uuid.h" namespace iceberg { @@ -57,9 +59,9 @@ class ICEBERG_EXPORT Literal : public util::Formattable { float, // for float double, // for double std::string, // for string - Uuid, // for uuid - std::vector, // for binary, fixed - std::array, // for decimal + std::vector, // for binary, fixed + ::iceberg::Decimal, // for decimal + Uuid, // for uuid BelowMin, AboveMax>; /// \brief Factory methods for primitive types @@ -77,6 +79,10 @@ class ICEBERG_EXPORT Literal : public util::Formattable { static Literal Binary(std::vector value); static Literal Fixed(std::vector value); + /// \brief Create a decimal literal. + /// \param value The unscaled 128-bit integer value. 
+ static Literal Decimal(int128_t value, int32_t precision, int32_t scale); + /// \brief Create a literal representing a null value. static Literal Null(std::shared_ptr type) { return {Value{std::monostate{}}, std::move(type)}; @@ -205,6 +211,11 @@ struct LiteralTraits { using ValueType = double; }; +template <> +struct LiteralTraits { + using ValueType = Decimal; +}; + template <> struct LiteralTraits { using ValueType = std::string; diff --git a/src/iceberg/manifest_adapter.cc b/src/iceberg/manifest_adapter.cc index bc0f834e9..c2ac30e70 100644 --- a/src/iceberg/manifest_adapter.cc +++ b/src/iceberg/manifest_adapter.cc @@ -220,9 +220,12 @@ Status ManifestEntryAdapter::AppendPartitionValues( break; case TypeId::kDecimal: ICEBERG_RETURN_UNEXPECTED(AppendField( - child_array, std::get>(partition_value.value()))); + child_array, std::get(partition_value.value()).ToBytes())); break; case TypeId::kUuid: + ICEBERG_RETURN_UNEXPECTED( + AppendField(child_array, std::get(partition_value.value()).bytes())); + break; case TypeId::kStruct: case TypeId::kList: case TypeId::kMap: diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 25bfdc647..1b24f85fb 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -81,11 +81,14 @@ iceberg_sources = files( 'transform.cc', 'transform_function.cc', 'type.cc', + 'util/bucket_util.cc', 'util/conversions.cc', 'util/decimal.cc', 'util/gzip_internal.cc', 'util/murmurhash3_internal.cc', + 'util/temporal_util.cc', 'util/timepoint.cc', + 'util/truncate_util.cc', 'util/uuid.cc', 'v1_metadata.cc', 'v2_metadata.cc', diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 7c62a2a55..68af62bf1 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -99,11 +99,13 @@ add_iceberg_test(json_serde_test add_iceberg_test(util_test SOURCES + bucket_util_test.cc config_test.cc decimal_test.cc endian_test.cc formatter_test.cc string_util_test.cc + truncate_util_test.cc 
uuid_test.cc visit_type_test.cc) diff --git a/src/iceberg/test/bucket_util_test.cc b/src/iceberg/test/bucket_util_test.cc new file mode 100644 index 000000000..69a04ef54 --- /dev/null +++ b/src/iceberg/test/bucket_util_test.cc @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/util/bucket_util.h" + +#include + +#include + +#include "iceberg/util/decimal.h" +#include "iceberg/util/uuid.h" + +namespace iceberg { + +// The following tests are from +// https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements +TEST(BucketUtilsTest, HashHelper) { + // int and long + EXPECT_EQ(BucketUtils::HashInt(34), 2017239379); + EXPECT_EQ(BucketUtils::HashLong(34L), 2017239379); + + // decimal hash + auto decimal = Decimal::FromString("14.20"); + ASSERT_TRUE(decimal.has_value()); + EXPECT_EQ(BucketUtils::HashBytes(decimal->ToBigEndian()), -500754589); + + // date hash + std::chrono::sys_days sd = std::chrono::year{2017} / 11 / 16; + std::chrono::sys_days epoch{std::chrono::year{1970} / 1 / 1}; + int32_t days = (sd - epoch).count(); + EXPECT_EQ(BucketUtils::HashInt(days), -653330422); + + // time + // 22:31:08 in microseconds + int64_t time_micros = (22 * 3600 + 31 * 60 + 8) * 1000000LL; + EXPECT_EQ(BucketUtils::HashLong(time_micros), -662762989); + + // timestamp + // 2017-11-16T22:31:08 in microseconds + std::chrono::system_clock::time_point tp = + std::chrono::sys_days{std::chrono::year{2017} / 11 / 16} + std::chrono::hours{22} + + std::chrono::minutes{31} + std::chrono::seconds{8}; + int64_t timestamp_micros = + std::chrono::duration_cast(tp.time_since_epoch()) + .count(); + EXPECT_EQ(BucketUtils::HashLong(timestamp_micros), -2047944441); + // 2017-11-16T22:31:08.000001 in microseconds + EXPECT_EQ(BucketUtils::HashLong(timestamp_micros + 1), -1207196810); + + // string + std::string str = "iceberg"; + EXPECT_EQ(BucketUtils::HashBytes(std::span( + reinterpret_cast(str.data()), str.size())), + 1210000089); + + // uuid + auto uuid = Uuid::FromString("f79c3e09-677c-4bbd-a479-3f349cb785e7"); + EXPECT_EQ(BucketUtils::HashBytes(uuid->bytes()), 1488055340); + + // fixed & binary + std::vector fixed = {0, 1, 2, 3}; + EXPECT_EQ(BucketUtils::HashBytes(fixed), -188683207); +} + +} // namespace iceberg diff --git 
a/src/iceberg/test/decimal_test.cc b/src/iceberg/test/decimal_test.cc index 6850d7aad..71ba67417 100644 --- a/src/iceberg/test/decimal_test.cc +++ b/src/iceberg/test/decimal_test.cc @@ -490,6 +490,50 @@ TEST(DecimalTest, FromBigEndianInvalid) { IsError(ErrorKind::kInvalidArgument)); } +TEST(DecimalTest, ToBigEndian) { + std::vector high_values = {0, + 1, + -1, + INT32_MAX, + INT32_MIN, + static_cast(INT32_MAX) + 1, + static_cast(INT32_MIN) - 1, + INT64_MAX, + INT64_MIN}; + std::vector low_values = {0, + 1, + 255, + UINT32_MAX, + static_cast(UINT32_MAX) + 1, + static_cast(UINT32_MAX) + 2, + static_cast(UINT32_MAX) + 3, + static_cast(UINT32_MAX) + 4, + static_cast(UINT32_MAX) + 5, + static_cast(UINT32_MAX) + 6, + static_cast(UINT32_MAX) + 7, + static_cast(UINT32_MAX) + 8, + UINT64_MAX}; + + for (int64_t high : high_values) { + for (uint64_t low : low_values) { + Decimal decimal(high, low); + auto bytes = decimal.ToBigEndian(); + auto result = Decimal::FromBigEndian(bytes.data(), bytes.size()); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result.value(), decimal); + } + } + + for (int128_t value : std::vector{-INT64_MAX, -INT32_MAX, -255, -1, 0, 1, 255, + 256, INT32_MAX, INT64_MAX}) { + Decimal decimal(value); + auto bytes = decimal.ToBigEndian(); + auto result = Decimal::FromBigEndian(bytes.data(), bytes.size()); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(result.value(), decimal); + } +} + TEST(DecimalTestFunctionality, Multiply) { ASSERT_EQ(Decimal(60501), Decimal(301) * Decimal(201)); ASSERT_EQ(Decimal(-60501), Decimal(-301) * Decimal(201)); @@ -671,4 +715,58 @@ TEST(DecimalTest, Rescale) { ASSERT_THAT(Decimal(5555555).Rescale(6, 1), IsError(ErrorKind::kInvalid)); } +TEST(DecimalTest, Compare) { + // max positive unscaled value + // 10^38 - 1 scale cause overflow + ASSERT_EQ(Decimal::Compare(Decimal("99999999999999999999999999999999999999"), + Decimal("99999999999999999999999999999999999999"), 2, 3), + std::partial_ordering::greater); + // 10^37 - 1 scale no overflow 
+ ASSERT_EQ(Decimal::Compare(Decimal("9999999999999999999999999999999999999"), + Decimal("99999999999999999999999999999999999999"), 2, 3), + std::partial_ordering::less); + + // min negative unscaled value + // -10^38 + 1 scale cause overflow + ASSERT_EQ(Decimal::Compare(Decimal("-99999999999999999999999999999999999999"), + Decimal("-99999999999999999999999999999999999999"), 2, 3), + std::partial_ordering::less); + // -10^37 + 1 scale no overflow + ASSERT_EQ(Decimal::Compare(Decimal("-9999999999999999999999999999999999999"), + Decimal("-99999999999999999999999999999999999999"), 2, 3), + std::partial_ordering::greater); + + // equal values with different scales + ASSERT_EQ(Decimal::Compare(Decimal("123456789"), Decimal("1234567890"), 2, 3), + std::partial_ordering::equivalent); + ASSERT_EQ(Decimal::Compare(Decimal("-1234567890"), Decimal("-123456789"), 3, 2), + std::partial_ordering::equivalent); + + // different values with different scales + ASSERT_EQ(Decimal::Compare(Decimal("123456788"), Decimal("1234567890"), 2, 3), + std::partial_ordering::less); + ASSERT_EQ(Decimal::Compare(Decimal("-1234567890"), Decimal("-123456788"), 2, 3), + std::partial_ordering::less); + + // different values with same scales + ASSERT_EQ(Decimal::Compare(Decimal("123456790"), Decimal("123456789"), 2, 2), + std::partial_ordering::greater); + ASSERT_EQ(Decimal::Compare(Decimal("-123456790"), Decimal("-123456789"), 2, 2), + std::partial_ordering::less); + + // different signs + ASSERT_EQ(Decimal::Compare(Decimal("123456789"), Decimal("-123456789"), 2, 3), + std::partial_ordering::greater); + ASSERT_EQ(Decimal::Compare(Decimal("-123456789"), Decimal("123456789"), 2, 3), + std::partial_ordering::less); + + // zero comparisons + ASSERT_EQ(Decimal::Compare(Decimal("0"), Decimal("0"), 2, 3), + std::partial_ordering::equivalent); + ASSERT_EQ(Decimal::Compare(Decimal("0"), Decimal("123456789"), 2, 3), + std::partial_ordering::less); + ASSERT_EQ(Decimal::Compare(Decimal("-123456789"), 
Decimal("0"), 2, 3), + std::partial_ordering::less); +} + } // namespace iceberg diff --git a/src/iceberg/test/literal_test.cc b/src/iceberg/test/literal_test.cc index 6e4b2aa3b..0dd291d5c 100644 --- a/src/iceberg/test/literal_test.cc +++ b/src/iceberg/test/literal_test.cc @@ -256,6 +256,20 @@ TEST(LiteralTest, DoubleZeroComparison) { EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less); } +TEST(LiteralTest, UuidComparison) { + auto uuid1 = Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value(); + auto uuid2 = Uuid::FromString("123e4567-e89b-12d3-a456-426614174001").value(); + auto uuid3 = Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value(); + + auto literal1 = Literal::UUID(uuid1); + auto literal2 = Literal::UUID(uuid2); + auto literal3 = Literal::UUID(uuid3); + + EXPECT_EQ(literal1 <=> literal3, std::partial_ordering::equivalent); + EXPECT_EQ(literal1 <=> literal2, std::partial_ordering::unordered); + EXPECT_EQ(literal2 <=> literal1, std::partial_ordering::unordered); +} + // Parameter struct for literal serialization and deserialization tests struct LiteralParam { std::string test_name; @@ -346,6 +360,17 @@ INSTANTIATE_TEST_SUITE_P( Literal::Double(std::numeric_limits::lowest()), float64()}, + // Decimal type + LiteralParam{"DecimalPositive", + {1, 226, 64}, + Literal::Decimal(123456, 6, 2), + decimal(6, 2)}, + LiteralParam{"DecimalNegative", + {254, 29, 192}, + Literal::Decimal(-123456, 6, 2), + decimal(6, 2)}, + LiteralParam{"DecimalZero", {0}, Literal::Decimal(0, 3, 0), decimal(3, 0)}, + LiteralParam{"String", {105, 99, 101, 98, 101, 114, 103}, Literal::String("iceberg"), @@ -506,10 +531,28 @@ INSTANTIATE_TEST_SUITE_P( .literal = Literal::Double(std::numbers::pi), .expected_type_id = TypeId::kDouble, .expected_string = "3.141593"}, + BasicLiteralTestParam{.test_name = "DecimalPositive", + .literal = Literal::Decimal(123456, 6, 2), + .expected_type_id = TypeId::kDecimal, + .expected_string = "1234.56"}, + 
BasicLiteralTestParam{.test_name = "DecimalNegative", + .literal = Literal::Decimal(-123456, 6, 2), + .expected_type_id = TypeId::kDecimal, + .expected_string = "-1234.56"}, + BasicLiteralTestParam{.test_name = "DecimalZero", + .literal = Literal::Decimal(0, 3, 0), + .expected_type_id = TypeId::kDecimal, + .expected_string = "0"}, BasicLiteralTestParam{.test_name = "String", .literal = Literal::String("hello world"), .expected_type_id = TypeId::kString, .expected_string = "\"hello world\""}, + BasicLiteralTestParam{ + .test_name = "Uuid", + .literal = Literal::UUID( + Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value()), + .expected_type_id = TypeId::kUuid, + .expected_string = "123e4567-e89b-12d3-a456-426614174000"}, BasicLiteralTestParam{ .test_name = "Binary", .literal = Literal::Binary(std::vector{0x01, 0x02, 0x03, 0xFF}), @@ -563,6 +606,10 @@ INSTANTIATE_TEST_SUITE_P( .small_literal = Literal::Double(1.5), .large_literal = Literal::Double(2.5), .equal_literal = Literal::Double(1.5)}, + ComparisonLiteralTestParam{.test_name = "Decimal", + .small_literal = Literal::Decimal(123456, 6, 2), + .large_literal = Literal::Decimal(234567, 6, 2), + .equal_literal = Literal::Decimal(123456, 6, 2)}, ComparisonLiteralTestParam{.test_name = "String", .small_literal = Literal::String("apple"), .large_literal = Literal::String("banana"), @@ -672,6 +719,13 @@ INSTANTIATE_TEST_SUITE_P( .target_type = fixed(4), .expected_literal = Literal::Fixed(std::vector{ 0x01, 0x02, 0x03, 0x04})}, + // String cast tests + CastLiteralTestParam{ + .test_name = "StringToUuid", + .source_literal = Literal::String("123e4567-e89b-12d3-a456-426614174000"), + .target_type = uuid(), + .expected_literal = Literal::UUID( + Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value())}, // Same type cast test CastLiteralTestParam{.test_name = "IntToInt", .source_literal = Literal::Int(42), diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index dd3bd053e..88b16325c 
100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -68,11 +68,13 @@ iceberg_tests = { }, 'util_test': { 'sources': files( + 'bucket_util_test.cc', 'config_test.cc', 'decimal_test.cc', 'endian_test.cc', 'formatter_test.cc', 'string_util_test.cc', + 'truncate_util_test.cc', 'uuid_test.cc', 'visit_type_test.cc', ), diff --git a/src/iceberg/test/transform_test.cc b/src/iceberg/test/transform_test.cc index c1efcb56a..1003b9532 100644 --- a/src/iceberg/test/transform_test.cc +++ b/src/iceberg/test/transform_test.cc @@ -21,11 +21,13 @@ #include #include +#include #include #include #include "iceberg/expression/literal.h" +#include "iceberg/transform_function.h" #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "matchers.h" @@ -63,6 +65,7 @@ TEST(TransformFunctionTest, CreateTruncateTransform) { auto transformPtr = transform->Bind(iceberg::string()); EXPECT_EQ(transformPtr.value()->transform_type(), TransformType::kTruncate); } + TEST(TransformFromStringTest, PositiveCases) { struct Case { std::string str; @@ -187,373 +190,485 @@ TEST(TransformResultTypeTest, NegativeCases) { } } -TEST(TransformLiteralTest, IdentityTransform) { - struct Case { - std::shared_ptr source_type; - Literal source; - Literal expected; - }; +// Parameterized tests for transform functions +struct TransformParam { + std::string str; + // The integer parameter associated with the transform. 
+ int32_t param; + std::shared_ptr source_type; + Literal source; + Literal expected; +}; - const std::vector cases = { - {.source_type = iceberg::boolean(), - .source = Literal::Boolean(true), - .expected = Literal::Boolean(true)}, - {.source_type = iceberg::int32(), - .source = Literal::Int(42), - .expected = Literal::Int(42)}, - {.source_type = iceberg::int32(), - .source = Literal::Date(30000), - .expected = Literal::Date(30000)}, - {.source_type = iceberg::int64(), - .source = Literal::Long(1234567890), - .expected = Literal::Long(1234567890)}, - {.source_type = iceberg::timestamp(), - .source = Literal::Timestamp(1622547800000000), - .expected = Literal::Timestamp(1622547800000000)}, - {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000000), - .expected = Literal::TimestampTz(1622547800000000)}, - {.source_type = iceberg::float32(), - .source = Literal::Float(3.14), - .expected = Literal::Float(3.14)}, - {.source_type = iceberg::float64(), - .source = Literal::Double(1.23e-5), - .expected = Literal::Double(1.23e-5)}, - {.source_type = iceberg::string(), - .source = Literal::String("Hello, World!"), - .expected = Literal::String("Hello, World!")}, - {.source_type = iceberg::binary(), - .source = Literal::Binary({0x01, 0x02, 0x03}), - .expected = Literal::Binary({0x01, 0x02, 0x03})}, - }; +class TransformLiteralTest : public ::testing::TestWithParam {}; - for (const auto& c : cases) { - auto transform = Transform::Identity(); - auto transformPtr = transform->Bind(c.source_type); - ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind identity transform"; +TEST_P(TransformLiteralTest, IdentityTransform) { + const auto& param = GetParam(); - auto result = transformPtr.value()->Transform(c.source); - ASSERT_TRUE(result.has_value()) - << "Failed to transform literal: " << c.source.ToString(); + auto transform = Transform::Identity(); + auto transformPtr = transform->Bind(param.source_type); + 
ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind identity transform"; - EXPECT_EQ(result.value(), c.expected) - << "Unexpected result for source: " << c.source.ToString(); - } + auto result = transformPtr.value()->Transform(param.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << param.source.ToString(); + + EXPECT_EQ(result.value(), param.expected) + << "Unexpected result for source: " << param.source.ToString(); } -TEST(TransformLiteralTest, BucketTransform) { +INSTANTIATE_TEST_SUITE_P( + IdentityTransformTests, TransformLiteralTest, + ::testing::Values( + TransformParam{.str = "BooleanTrue", + .source_type = iceberg::boolean(), + .source = Literal::Boolean(true), + .expected = Literal::Boolean(true)}, + TransformParam{.str = "BooleanFalse", + .source_type = iceberg::boolean(), + .source = Literal::Boolean(false), + .expected = Literal::Boolean(false)}, + TransformParam{.str = "Int32", + .source_type = iceberg::int32(), + .source = Literal::Int(42), + .expected = Literal::Int(42)}, + TransformParam{.str = "Date", + .source_type = iceberg::int32(), + .source = Literal::Date(30000), + .expected = Literal::Date(30000)}, + TransformParam{.str = "Int64", + .source_type = iceberg::int64(), + .source = Literal::Long(1234567890), + .expected = Literal::Long(1234567890)}, + TransformParam{.str = "Timestamp", + .source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Timestamp(1622547800000000)}, + TransformParam{.str = "TimestampTz", + .source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::TimestampTz(1622547800000000)}, + TransformParam{.str = "Float", + .source_type = iceberg::float32(), + .source = Literal::Float(3.14), + .expected = Literal::Float(3.14)}, + TransformParam{.str = "Double", + .source_type = iceberg::float64(), + .source = Literal::Double(1.23e-5), + .expected = Literal::Double(1.23e-5)}, + 
TransformParam{.str = "Decimal", + .source_type = iceberg::decimal(10, 2), + .source = Literal::Decimal(123456, 10, 2), + .expected = Literal::Decimal(123456, 10, 2)}, + TransformParam{.str = "String", + .source_type = iceberg::string(), + .source = Literal::String("Hello, World!"), + .expected = Literal::String("Hello, World!")}, + TransformParam{ + .str = "Uuid", + .source_type = iceberg::uuid(), + .source = Literal::UUID( + Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value()), + .expected = Literal::UUID( + Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value())}, + TransformParam{.str = "Binary", + .source_type = iceberg::binary(), + .source = Literal::Binary({0x01, 0x02, 0x03}), + .expected = Literal::Binary({0x01, 0x02, 0x03})}, + TransformParam{.str = "Fixed", + .source_type = iceberg::fixed(3), + .source = Literal::Fixed({0x01, 0x02, 0x03}), + .expected = Literal::Fixed({0x01, 0x02, 0x03})}), + [](const ::testing::TestParamInfo& info) { return info.param.str; }); + +class BucketTransformTest : public ::testing::TestWithParam {}; + +TEST_P(BucketTransformTest, BucketTransform) { constexpr int32_t num_buckets = 4; auto transform = Transform::Bucket(num_buckets); - struct Case { - std::shared_ptr source_type; - Literal source; - Literal expected; - }; + const auto& param = GetParam(); + auto transformPtr = transform->Bind(param.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind bucket transform"; + auto result = transformPtr.value()->Transform(param.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << param.source.ToString(); - const std::vector cases = { - {.source_type = iceberg::int32(), - .source = Literal::Int(42), - .expected = Literal::Int(3)}, - {.source_type = iceberg::date(), - .source = Literal::Date(30000), - .expected = Literal::Int(2)}, - {.source_type = iceberg::int64(), - .source = Literal::Long(1234567890), - .expected = Literal::Int(3)}, - {.source_type = 
iceberg::timestamp(), - .source = Literal::Timestamp(1622547800000000), - .expected = Literal::Int(1)}, - {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000000), - .expected = Literal::Int(1)}, - {.source_type = iceberg::string(), - .source = Literal::String("test"), - .expected = Literal::Int(3)}, - }; - - for (const auto& c : cases) { - auto transformPtr = transform->Bind(c.source_type); - ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind bucket transform"; - auto result = transformPtr.value()->Transform(c.source); - ASSERT_TRUE(result.has_value()) - << "Failed to transform literal: " << c.source.ToString(); - - EXPECT_EQ(result.value(), c.expected) - << "Unexpected result for source: " << c.source.ToString(); - } + EXPECT_EQ(result.value(), param.expected) + << "Unexpected result for source: " << param.source.ToString(); } -TEST(TransformLiteralTest, TruncateTransform) { - struct Case { - std::shared_ptr source_type; - int32_t width; - Literal source; - Literal expected; - }; - - const std::vector cases = { - {.source_type = iceberg::int32(), - .width = 5, - .source = Literal::Int(123456), - .expected = Literal::Int(123455)}, - {.source_type = iceberg::string(), - .width = 5, - .source = Literal::String("Hello, World!"), - .expected = Literal::String("Hello")}, - {.source_type = iceberg::string(), - .width = 5, - .source = Literal::String("😜🧐🤔🤪🥳😵‍💫😂"), - // Truncate to 5 utf-8 code points - .expected = Literal::String("😜🧐🤔🤪🥳")}, - {.source_type = iceberg::string(), - .width = 8, - .source = Literal::String("a😜b🧐c🤔d🤪e🥳"), - .expected = Literal::String("a😜b🧐c🤔d🤪")}, - {.source_type = iceberg::binary(), - .width = 5, - .source = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05, 0x06}), - .expected = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05})}, - }; - - for (const auto& c : cases) { - auto transform = Transform::Truncate(c.width); - auto transformPtr = transform->Bind(c.source_type); - 
ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind truncate transform"; - auto result = transformPtr.value()->Transform(c.source); - ASSERT_TRUE(result.has_value()) - << "Failed to transform literal: " << c.source.ToString(); - - EXPECT_EQ(result.value(), c.expected) - << "Unexpected result for source: " << c.source.ToString(); - } +INSTANTIATE_TEST_SUITE_P( + BucketTransformTests, BucketTransformTest, + ::testing::Values( + TransformParam{.str = "Int32", + .source_type = iceberg::int32(), + .source = Literal::Int(34), + .expected = Literal::Int(3)}, + TransformParam{.str = "Int64", + .source_type = iceberg::int64(), + .source = Literal::Long(34), + .expected = Literal::Int(3)}, + TransformParam{.str = "Decimal", + // 14.20 + .source_type = iceberg::decimal(4, 2), + .source = Literal::Decimal(1420, 4, 2), + .expected = Literal::Int(3)}, + TransformParam{.str = "Date", + // 2017-11-16 + .source_type = iceberg::date(), + .source = Literal::Date(17486), + .expected = Literal::Int(2)}, + TransformParam{.str = "Time", + // 22:31:08 in microseconds + .source_type = iceberg::time(), + .source = Literal::Time(81068000000), + .expected = Literal::Int(3)}, + TransformParam{.str = "Timestamp", + // 2017-11-16T22:31:08 in microseconds + .source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1510871468000000), + .expected = Literal::Int(3)}, + TransformParam{.str = "TimestampTz", + // 2017-11-16T22:31:08.000001 in microseconds + .source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1510871468000001), + .expected = Literal::Int(2)}, + TransformParam{.str = "String", + .source_type = iceberg::string(), + .source = Literal::String("iceberg"), + .expected = Literal::Int(1)}, + TransformParam{ + .str = "Uuid", + .source_type = iceberg::uuid(), + .source = Literal::UUID( + Uuid::FromString("f79c3e09-677c-4bbd-a479-3f349cb785e7").value()), + .expected = Literal::Int(0)}, + TransformParam{.str = "Fixed", + .source_type = iceberg::fixed(4), + 
.source = Literal::Fixed({0, 1, 2, 3}), + .expected = Literal::Int(1)}, + TransformParam{.str = "Binary", + .source_type = iceberg::binary(), + .source = Literal::Binary({0, 1, 2, 3}), + .expected = Literal::Int(1)}), + [](const ::testing::TestParamInfo& info) { return info.param.str; }); + +class TruncateTransformTest : public ::testing::TestWithParam {}; + +TEST_P(TruncateTransformTest, TruncateTransform) { + const auto& param = GetParam(); + auto transform = Transform::Truncate(param.param); + auto transformPtr = transform->Bind(param.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind truncate transform"; + auto result = transformPtr.value()->Transform(param.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << param.source.ToString(); + + EXPECT_EQ(result.value(), param.expected) + << "Unexpected result for source: " << param.source.ToString(); } -TEST(TransformLiteralTest, YearTransform) { +INSTANTIATE_TEST_SUITE_P( + TruncateTransformTests, TruncateTransformTest, + ::testing::Values( + TransformParam{.str = "Int32", + .param = 5, + .source_type = iceberg::int32(), + .source = Literal::Int(123456), + .expected = Literal::Int(123455)}, + TransformParam{.str = "Int64", + .param = 10, + .source_type = iceberg::int64(), + .source = Literal::Long(-1), + .expected = Literal::Long(-10)}, + TransformParam{.str = "Decimal", + .param = 50, + .source_type = iceberg::decimal(5, 2), + .source = Literal::Decimal(12345, 5, 2), + .expected = Literal::Decimal(12300, 5, 2)}, + TransformParam{.str = "StringShort", + .param = 5, + .source_type = iceberg::string(), + .source = Literal::String("Hello, World!"), + .expected = Literal::String("Hello")}, + TransformParam{.str = "StringEmoji", + .param = 5, + .source_type = iceberg::string(), + .source = Literal::String("😜🧐🤔🤪🥳😵‍💫😂"), + .expected = Literal::String("😜🧐🤔🤪🥳")}, + TransformParam{.str = "StringMixed", + .param = 8, + .source_type = iceberg::string(), + .source = 
Literal::String("a😜b🧐c🤔d🤪e🥳"), + .expected = Literal::String("a😜b🧐c🤔d🤪")}, + TransformParam{.str = "Binary", + .param = 5, + .source_type = iceberg::binary(), + .source = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05, 0x06}), + .expected = Literal::Binary({0x01, 0x02, 0x03, 0x04, 0x05})}), + [](const ::testing::TestParamInfo& info) { return info.param.str; }); + +class YearTransformTest : public ::testing::TestWithParam {}; + +TEST_P(YearTransformTest, YearTransform) { auto transform = Transform::Year(); + const auto& param = GetParam(); - struct Case { - std::shared_ptr source_type; - Literal source; - Literal expected; - }; + auto transformPtr = transform->Bind(param.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind year transform"; - const std::vector cases = { - {.source_type = iceberg::timestamp(), - // 2021-06-01T11:43:20Z - .source = Literal::Timestamp(1622547800000000), - .expected = Literal::Int(2021)}, - {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000000), - .expected = Literal::Int(2021)}, - {.source_type = iceberg::date(), - .source = Literal::Date(30000), - .expected = Literal::Int(2052)}, - }; + auto result = transformPtr.value()->Transform(param.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << param.source.ToString(); - for (const auto& c : cases) { - auto transformPtr = transform->Bind(c.source_type); - ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind year transform"; - auto result = transformPtr.value()->Transform(c.source); - ASSERT_TRUE(result.has_value()) - << "Failed to transform literal: " << c.source.ToString(); - - EXPECT_EQ(result.value(), c.expected) - << "Unexpected result for source: " << c.source.ToString(); - } + EXPECT_EQ(result.value(), param.expected) + << "Unexpected result for source: " << param.source.ToString(); } -TEST(TransformLiteralTest, MonthTransform) { +INSTANTIATE_TEST_SUITE_P( + YearTransformTests, YearTransformTest, 
+ ::testing::Values(TransformParam{.str = "Timestamp", + // 2021-06-01T11:43:20Z + .source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Int(2021)}, + TransformParam{.str = "TimestampTz", + .source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::Int(2021)}, + TransformParam{.str = "Date", + .source_type = iceberg::date(), + .source = Literal::Date(30000), + .expected = Literal::Int(2052)}), + [](const ::testing::TestParamInfo& info) { return info.param.str; }); + +class MonthTransformTest : public ::testing::TestWithParam {}; + +TEST_P(MonthTransformTest, MonthTransform) { auto transform = Transform::Month(); + const auto& param = GetParam(); - struct Case { - std::shared_ptr source_type; - Literal source; - Literal expected; - }; - - const std::vector cases = { - {.source_type = iceberg::timestamp(), - .source = Literal::Timestamp(1622547800000000), - .expected = Literal::Int(617)}, - {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000000), - .expected = Literal::Int(617)}, - {.source_type = iceberg::date(), - .source = Literal::Date(30000), - .expected = Literal::Int(985)}, - }; + auto transformPtr = transform->Bind(param.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind month transform"; - for (const auto& c : cases) { - auto transformPtr = transform->Bind(c.source_type); - ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind month transform"; - auto result = transformPtr.value()->Transform(c.source); - ASSERT_TRUE(result.has_value()) - << "Failed to transform literal: " << c.source.ToString(); + auto result = transformPtr.value()->Transform(param.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << param.source.ToString(); - EXPECT_EQ(result.value(), c.expected) - << "Unexpected result for source: " << c.source.ToString(); - } + EXPECT_EQ(result.value(), 
param.expected) + << "Unexpected result for source: " << param.source.ToString(); } -TEST(TransformFunctionTransformTest, DayTransform) { +INSTANTIATE_TEST_SUITE_P( + MonthTransformTests, MonthTransformTest, + ::testing::Values(TransformParam{.str = "Timestamp", + .source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Int(617)}, + TransformParam{.str = "TimestampTz", + .source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::Int(617)}, + TransformParam{.str = "Date", + .source_type = iceberg::date(), + .source = Literal::Date(30000), + .expected = Literal::Int(985)}), + [](const ::testing::TestParamInfo& info) { return info.param.str; }); + +class DayTransformTest : public ::testing::TestWithParam {}; + +TEST_P(DayTransformTest, DayTransform) { auto transform = Transform::Day(); + const auto& param = GetParam(); - struct Case { - std::shared_ptr source_type; - Literal source; - Literal expected; - }; + auto transformPtr = transform->Bind(param.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind day transform"; - const std::vector cases = { - {.source_type = iceberg::timestamp(), - .source = Literal::Timestamp(1622547800000000), - .expected = Literal::Int(18779)}, - {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000000), - .expected = Literal::Int(18779)}, - {.source_type = iceberg::date(), - .source = Literal::Date(30000), - .expected = Literal::Int(30000)}, - }; - - for (const auto& c : cases) { - auto transformPtr = transform->Bind(c.source_type); - ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind day transform"; - auto result = transformPtr.value()->Transform(c.source); - ASSERT_TRUE(result.has_value()) - << "Failed to transform literal: " << c.source.ToString(); + auto result = transformPtr.value()->Transform(param.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform 
literal: " << param.source.ToString(); - EXPECT_EQ(result.value(), c.expected) - << "Unexpected result for source: " << c.source.ToString(); - } + EXPECT_EQ(result.value(), param.expected) + << "Unexpected result for source: " << param.source.ToString(); } -TEST(TransformLiteralTest, HourTransform) { +INSTANTIATE_TEST_SUITE_P( + DayTransformTests, DayTransformTest, + ::testing::Values(TransformParam{.str = "Timestamp", + .source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Int(18779)}, + TransformParam{.str = "TimestampTz", + .source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::Int(18779)}, + TransformParam{.str = "Date", + .source_type = iceberg::date(), + .source = Literal::Date(30000), + .expected = Literal::Int(30000)}), + [](const ::testing::TestParamInfo& info) { return info.param.str; }); + +class HourTransformTest : public ::testing::TestWithParam {}; + +TEST_P(HourTransformTest, HourTransform) { auto transform = Transform::Hour(); + const auto& param = GetParam(); - struct Case { - std::shared_ptr source_type; - Literal source; - Literal expected; - }; - - const std::vector cases = { - {.source_type = iceberg::timestamp(), - .source = Literal::Timestamp(1622547800000000), - .expected = Literal::Int(450707)}, - {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000000), - .expected = Literal::Int(450707)}, - }; + auto transformPtr = transform->Bind(param.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind hour transform"; - for (const auto& c : cases) { - auto transformPtr = transform->Bind(c.source_type); - ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind hour transform"; - auto result = transformPtr.value()->Transform(c.source); - ASSERT_TRUE(result.has_value()) - << "Failed to transform literal: " << c.source.ToString(); + auto result = 
transformPtr.value()->Transform(param.source); + ASSERT_TRUE(result.has_value()) + << "Failed to transform literal: " << param.source.ToString(); - EXPECT_EQ(result.value(), c.expected) - << "Unexpected result for source: " << c.source.ToString(); - } + EXPECT_EQ(result.value(), param.expected) + << "Unexpected result for source: " << param.source.ToString(); } -TEST(TransformLiteralTest, VoidTransform) { +INSTANTIATE_TEST_SUITE_P( + HourTransformTests, HourTransformTest, + ::testing::Values(TransformParam{.str = "Timestamp", + .source_type = iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Int(450707)}, + TransformParam{.str = "TimestampTz", + .source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::Int(450707)}), + [](const ::testing::TestParamInfo& info) { return info.param.str; }); + +class VoidTransformTest : public ::testing::TestWithParam {}; + +TEST_P(VoidTransformTest, VoidTransform) { auto transform = Transform::Void(); - - struct Case { - std::shared_ptr source_type; - Literal source; - }; - - const std::vector cases = { - {.source_type = iceberg::boolean(), .source = Literal::Boolean(true)}, - {.source_type = iceberg::int32(), .source = Literal::Int(42)}, - {.source_type = iceberg::date(), .source = Literal::Date(30000)}, - {.source_type = iceberg::int64(), .source = Literal::Long(1234567890)}, - {.source_type = iceberg::timestamp(), - .source = Literal::Timestamp(1622547800000000)}, - {.source_type = iceberg::timestamp_tz(), - .source = Literal::TimestampTz(1622547800000000)}, - {.source_type = iceberg::float32(), .source = Literal::Float(3.14)}, - {.source_type = iceberg::float64(), .source = Literal::Double(1.23e-5)}, - {.source_type = iceberg::string(), .source = Literal::String("Hello, World!")}, - {.source_type = iceberg::binary(), .source = Literal::Binary({0x01, 0x02, 0x03})}, - }; - - for (const auto& c : cases) { - auto transformPtr = 
transform->Bind(c.source_type); - ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind void transform"; - auto result = transformPtr.value()->Transform(c.source); - EXPECT_TRUE(result->IsNull()) - << "Expected void transform to return null type for source: " - << c.source.ToString(); - EXPECT_EQ(result->type()->type_id(), c.source_type->type_id()) - << "Expected void transform to return same type as source for: " - << c.source.ToString(); - } + const auto& param = GetParam(); + + auto transformPtr = transform->Bind(param.source_type); + ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind void transform"; + + auto result = transformPtr.value()->Transform(param.source); + EXPECT_TRUE(result->IsNull()) + << "Expected void transform to return null type for source: " + << param.source.ToString(); + EXPECT_EQ(result->type()->type_id(), param.source_type->type_id()) + << "Expected void transform to return same type as source for: " + << param.source.ToString(); + EXPECT_EQ(result->ToString(), param.expected.ToString()) + << "Unexpected result for source: " << param.source.ToString(); } -TEST(TransformLiteralTest, NullLiteral) { - struct Case { - std::string str; - std::shared_ptr source_type; - Literal source; - std::shared_ptr expected_result_type; - }; - - const std::vector cases = { - {.str = "identity", - .source_type = iceberg::string(), - .source = Literal::Null(iceberg::string()), - .expected_result_type = iceberg::string()}, - {.str = "year", - .source_type = iceberg::timestamp(), - .source = Literal::Null(iceberg::timestamp()), - .expected_result_type = iceberg::int32()}, - {.str = "month", - .source_type = iceberg::timestamp(), - .source = Literal::Null(iceberg::timestamp()), - .expected_result_type = iceberg::int32()}, - {.str = "day", - .source_type = iceberg::timestamp(), - .source = Literal::Null(iceberg::timestamp()), - .expected_result_type = iceberg::int32()}, - {.str = "hour", - .source_type = iceberg::timestamp(), - .source = 
Literal::Null(iceberg::timestamp()), - .expected_result_type = iceberg::int32()}, - {.str = "void", - .source_type = iceberg::string(), - .source = Literal::Null(iceberg::string()), - .expected_result_type = iceberg::string()}, - {.str = "bucket[16]", - .source_type = iceberg::string(), - .source = Literal::Null(iceberg::string()), - .expected_result_type = iceberg::int32()}, - {.str = "truncate[32]", - .source_type = iceberg::string(), - .source = Literal::Null(iceberg::string()), - .expected_result_type = iceberg::string()}, - }; - - for (const auto& c : cases) { - auto result = TransformFromString(c.str); - ASSERT_TRUE(result.has_value()) << "Failed to parse: " << c.str; - - const auto& transform = result.value(); - const auto transformPtr = transform->Bind(c.source_type); - ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind: " << c.str; - - auto transform_result = transformPtr.value()->Transform(c.source); - EXPECT_TRUE(transform_result->IsNull()) - << "Expected void transform to return null type for source: " - << c.source.ToString(); - EXPECT_EQ(transform_result->type()->type_id(), c.expected_result_type->type_id()) - << "Expected void transform to return same type as source for: " - << c.source.ToString(); - } +INSTANTIATE_TEST_SUITE_P( + VoidTransformTests, VoidTransformTest, + ::testing::Values( + TransformParam{.str = "Boolean", + .source_type = iceberg::boolean(), + .source = Literal::Boolean(true), + .expected = Literal::Null(iceberg::boolean())}, + TransformParam{.str = "Int32", + .source_type = iceberg::int32(), + .source = Literal::Int(42), + .expected = Literal::Null(iceberg::int32())}, + TransformParam{.str = "Date", + .source_type = iceberg::date(), + .source = Literal::Date(30000), + .expected = Literal::Null(iceberg::date())}, + TransformParam{.str = "Int64", + .source_type = iceberg::int64(), + .source = Literal::Long(1234567890), + .expected = Literal::Null(iceberg::int64())}, + TransformParam{.str = "Timestamp", + .source_type = 
iceberg::timestamp(), + .source = Literal::Timestamp(1622547800000000), + .expected = Literal::Null(iceberg::timestamp())}, + TransformParam{.str = "TimestampTz", + .source_type = iceberg::timestamp_tz(), + .source = Literal::TimestampTz(1622547800000000), + .expected = Literal::Null(iceberg::timestamp_tz())}, + TransformParam{.str = "Float", + .source_type = iceberg::float32(), + .source = Literal::Float(3.14), + .expected = Literal::Null(iceberg::float32())}, + TransformParam{.str = "Double", + .source_type = iceberg::float64(), + .source = Literal::Double(1.23e-5), + .expected = Literal::Null(iceberg::float64())}, + TransformParam{.str = "Decimal", + .source_type = iceberg::decimal(10, 2), + .source = Literal::Decimal(123456, 10, 2), + .expected = Literal::Null(iceberg::decimal(10, 2))}, + TransformParam{.str = "String", + .source_type = iceberg::string(), + .source = Literal::String("Hello, World!"), + .expected = Literal::Null(iceberg::string())}, + TransformParam{ + .str = "Uuid", + .source_type = iceberg::uuid(), + .source = Literal::UUID( + Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value()), + .expected = Literal::Null(iceberg::uuid())}, + TransformParam{.str = "Binary", + .source_type = iceberg::binary(), + .source = Literal::Binary({0x01, 0x02, 0x03}), + .expected = Literal::Null(iceberg::binary())}, + TransformParam{.str = "Fixed", + .source_type = iceberg::fixed(3), + .source = Literal::Fixed({0x01, 0x02, 0x03}), + .expected = Literal::Null(iceberg::fixed(3))}), + [](const ::testing::TestParamInfo& info) { return info.param.str; }); + +class NullLiteralTransformTest : public ::testing::TestWithParam {}; + +TEST_P(NullLiteralTransformTest, NullLiteralTransform) { + const auto& param = GetParam(); + + auto result = TransformFromString(param.str); + ASSERT_TRUE(result.has_value()) << "Failed to parse: " << param.str; + + const auto& transform = result.value(); + const auto transformPtr = transform->Bind(param.source_type); + 
ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind: " << param.str; + + auto transform_result = transformPtr.value()->Transform(param.source); + EXPECT_TRUE(transform_result->IsNull()) + << "Expected transform to return null type for source: " << param.source.ToString(); + EXPECT_EQ(transform_result->ToString(), param.expected.ToString()) + << "Unexpected result for source: " << param.source.ToString(); } +INSTANTIATE_TEST_SUITE_P( + NullLiteralTransformTests, NullLiteralTransformTest, + ::testing::Values(TransformParam{.str = "identity", + .source_type = iceberg::string(), + .source = Literal::Null(iceberg::string()), + .expected = Literal::Null(iceberg::string())}, + TransformParam{.str = "year", + .source_type = iceberg::timestamp(), + .source = Literal::Null(iceberg::timestamp()), + .expected = Literal::Null(iceberg::int32())}, + TransformParam{.str = "month", + .source_type = iceberg::timestamp(), + .source = Literal::Null(iceberg::timestamp()), + .expected = Literal::Null(iceberg::int32())}, + TransformParam{.str = "day", + .source_type = iceberg::timestamp(), + .source = Literal::Null(iceberg::timestamp()), + .expected = Literal::Null(iceberg::int32())}, + TransformParam{.str = "hour", + .source_type = iceberg::timestamp(), + .source = Literal::Null(iceberg::timestamp()), + .expected = Literal::Null(iceberg::int32())}, + TransformParam{.str = "void", + .source_type = iceberg::string(), + .source = Literal::Null(iceberg::string()), + .expected = Literal::Null(iceberg::string())}, + TransformParam{.str = "bucket[16]", + .source_type = iceberg::string(), + .source = Literal::Null(iceberg::string()), + .expected = Literal::Null(iceberg::int32())}, + TransformParam{.str = "truncate[32]", + .source_type = iceberg::string(), + .source = Literal::Null(iceberg::string()), + .expected = Literal::Null(iceberg::string())})); + } // namespace iceberg diff --git a/src/iceberg/test/truncate_util_test.cc b/src/iceberg/test/truncate_util_test.cc new file mode 100644 
index 000000000..61010fcd8 --- /dev/null +++ b/src/iceberg/test/truncate_util_test.cc @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/truncate_util.h" + +#include + +#include "iceberg/expression/literal.h" + +namespace iceberg { + +// The following tests are from +// https://iceberg.apache.org/spec/#truncate-transform-details +TEST(TruncateUtilTest, TruncateLiteral) { + // Integer + EXPECT_EQ(TruncateUtils::TruncateLiteral(Literal::Int(1), 10), Literal::Int(0)); + EXPECT_EQ(TruncateUtils::TruncateLiteral(Literal::Int(-1), 10), Literal::Int(-10)); + EXPECT_EQ(TruncateUtils::TruncateLiteral(Literal::Long(1), 10), Literal::Long(0)); + EXPECT_EQ(TruncateUtils::TruncateLiteral(Literal::Long(-1), 10), Literal::Long(-10)); + + // Decimal + EXPECT_EQ(TruncateUtils::TruncateLiteral(Literal::Decimal(1065, 4, 2), 50), + Literal::Decimal(1050, 4, 2)); + + // String + EXPECT_EQ(TruncateUtils::TruncateLiteral(Literal::String("iceberg"), 3), + Literal::String("ice")); + + // Binary + std::string data = "\x01\x02\x03\x04\x05"; + std::string expected = "\x01\x02\x03"; + EXPECT_EQ(TruncateUtils::TruncateLiteral( + Literal::Binary(std::vector(data.begin(), data.end())), 3), + 
Literal::Binary(std::vector(expected.begin(), expected.end()))); +} + +} // namespace iceberg diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index fd9a1659b..e2f5ecec9 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -20,16 +20,14 @@ #include "iceberg/transform_function.h" #include -#include -#include -#include -#include #include "iceberg/expression/literal.h" #include "iceberg/type.h" -#include "iceberg/util/murmurhash3_internal.h" +#include "iceberg/type_fwd.h" +#include "iceberg/util/bucket_util.h" +#include "iceberg/util/macros.h" +#include "iceberg/util/temporal_util.h" #include "iceberg/util/truncate_util.h" -#include "iceberg/util/uuid.h" namespace iceberg { @@ -54,48 +52,14 @@ BucketTransform::BucketTransform(std::shared_ptr const& source_type, : TransformFunction(TransformType::kBucket, source_type), num_buckets_(num_buckets) {} Result BucketTransform::Transform(const Literal& literal) { - assert(literal.type() == source_type()); - if (literal.IsBelowMin() || literal.IsAboveMax()) { - return InvalidArgument( - "Cannot apply bucket transform to literal with value {} of type {}", - literal.ToString(), source_type()->ToString()); - } + ICEBERG_DCHECK(*literal.type() == *source_type(), + "Literal type must match source type"); if (literal.IsNull()) [[unlikely]] { return Literal::Null(int32()); } - int32_t hash_value = 0; - std::visit( - [&](auto&& value) { - using T = std::decay_t; - if constexpr (std::is_same_v) { - MurmurHash3_x86_32(&value, sizeof(int32_t), 0, &hash_value); - } else if constexpr (std::is_same_v) { - MurmurHash3_x86_32(&value, sizeof(int64_t), 0, &hash_value); - } else if constexpr (std::is_same_v>) { - MurmurHash3_x86_32(value.data(), sizeof(uint8_t) * 16, 0, &hash_value); - } else if constexpr (std::is_same_v) { - MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value); - } else if constexpr (std::is_same_v) { - 
MurmurHash3_x86_32(std::get(literal.value()).bytes().data(), - Uuid::kLength, 0, &hash_value); - } else if constexpr (std::is_same_v>) { - MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value); - } else if constexpr (std::is_same_v || - std::is_same_v || std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v) { - std::unreachable(); - } else { - static_assert(false, "Unhandled type in BucketTransform::Transform"); - } - }, - literal.value()); - - // Calculate the bucket index - int32_t bucket_index = - (hash_value & std::numeric_limits::max()) % num_buckets_; + ICEBERG_ASSIGN_OR_RAISE(auto bucket_index, + BucketUtils::BucketIndex(literal, num_buckets_)) return Literal::Int(bucket_index); } @@ -135,47 +99,9 @@ TruncateTransform::TruncateTransform(std::shared_ptr const& source_type, : TransformFunction(TransformType::kTruncate, source_type), width_(width) {} Result TruncateTransform::Transform(const Literal& literal) { - assert(literal.type() == source_type()); - if (literal.IsBelowMin() || literal.IsAboveMax()) { - return InvalidArgument( - "Cannot apply truncate transform to literal with value {} of type {}", - literal.ToString(), source_type()->ToString()); - } - if (literal.IsNull()) [[unlikely]] { - // Return null as is - return literal; - } - - switch (source_type()->type_id()) { - case TypeId::kInt: { - auto value = std::get(literal.value()); - return Literal::Int(TruncateUtils::TruncateInteger(value, width_)); - } - case TypeId::kLong: { - auto value = std::get(literal.value()); - return Literal::Long(TruncateUtils::TruncateInteger(value, width_)); - } - case TypeId::kDecimal: { - // TODO(zhjwpku): Handle decimal truncation logic here - return NotImplemented("Truncate for Decimal is not implemented yet"); - } - case TypeId::kString: { - // Strings are truncated to a valid UTF-8 string with no more than L code points. 
- auto value = std::get(literal.value()); - return Literal::String(TruncateUtils::TruncateUTF8(std::move(value), width_)); - } - case TypeId::kBinary: { - /// In contrast to strings, binary values do not have an assumed encoding and are - /// truncated to L bytes. - auto value = std::get>(literal.value()); - if (value.size() > static_cast(width_)) { - value.resize(width_); - } - return Literal::Binary(std::move(value)); - } - default: - std::unreachable(); - } + ICEBERG_DCHECK(*literal.type() == *source_type(), + "Literal type must match source type"); + return TruncateUtils::TruncateLiteral(literal, width_); } std::shared_ptr TruncateTransform::ResultType() const { return source_type(); } @@ -206,34 +132,9 @@ YearTransform::YearTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kTruncate, source_type) {} Result YearTransform::Transform(const Literal& literal) { - assert(literal.type() == source_type()); - if (literal.IsBelowMin() || literal.IsAboveMax()) { - return InvalidArgument( - "Cannot apply year transform to literal with value {} of type {}", - literal.ToString(), source_type()->ToString()); - } - if (literal.IsNull()) [[unlikely]] { - return Literal::Null(int32()); - } - - using namespace std::chrono; // NOLINT - switch (source_type()->type_id()) { - case TypeId::kDate: { - auto value = std::get(literal.value()); - auto epoch = sys_days(year{1970} / January / 1); - auto ymd = year_month_day(epoch + days{value}); - return Literal::Int(static_cast(ymd.year())); - } - case TypeId::kTimestamp: - case TypeId::kTimestampTz: { - auto value = std::get(literal.value()); - // Convert microseconds-since-epoch into a `year_month_day` object - auto ymd = year_month_day(floor(sys_time(microseconds{value}))); - return Literal::Int(static_cast(ymd.year())); - } - default: - std::unreachable(); - } + ICEBERG_DCHECK(*literal.type() == *source_type(), + "Literal type must match source type"); + return TemporalUtils::ExtractYear(literal); } 
std::shared_ptr YearTransform::ResultType() const { return int32(); } @@ -259,46 +160,9 @@ MonthTransform::MonthTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kMonth, source_type) {} Result MonthTransform::Transform(const Literal& literal) { - assert(literal.type() == source_type()); - if (literal.IsBelowMin() || literal.IsAboveMax()) { - return InvalidArgument( - "Cannot apply month transform to literal with value {} of type {}", - literal.ToString(), source_type()->ToString()); - } - if (literal.IsNull()) [[unlikely]] { - return Literal::Null(int32()); - } - - using namespace std::chrono; // NOLINT - switch (source_type()->type_id()) { - case TypeId::kDate: { - auto value = std::get(literal.value()); - auto epoch = sys_days(year{1970} / January / 1); - auto ymd = year_month_day(epoch + days{value}); - auto epoch_ymd = year_month_day(epoch); - auto delta = ymd.year() - epoch_ymd.year(); - // Calculate the month as months from 1970-01 - // Note: January is month 1, so we subtract 1 to get zero-based - // month count. - return Literal::Int(static_cast(delta.count() * 12 + - static_cast(ymd.month()) - 1)); - } - case TypeId::kTimestamp: - case TypeId::kTimestampTz: { - auto value = std::get(literal.value()); - // Convert microseconds-since-epoch into a `year_month_day` object - auto ymd = year_month_day(floor(sys_time(microseconds{value}))); - auto epoch_ymd = year_month_day(year{1970} / January / 1); - auto delta = ymd.year() - epoch_ymd.year(); - // Calculate the month as months from 1970-01 - // Note: January is month 1, so we subtract 1 to get zero-based - // month count. 
- return Literal::Int(static_cast(delta.count() * 12 + - static_cast(ymd.month()) - 1)); - } - default: - std::unreachable(); - } + ICEBERG_DCHECK(*literal.type() == *source_type(), + "Literal type must match source type"); + return TemporalUtils::ExtractMonth(literal); } std::shared_ptr MonthTransform::ResultType() const { return int32(); } @@ -324,34 +188,9 @@ DayTransform::DayTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kDay, source_type) {} Result DayTransform::Transform(const Literal& literal) { - assert(literal.type() == source_type()); - if (literal.IsBelowMin() || literal.IsAboveMax()) { - return InvalidArgument( - "Cannot apply day transform to literal with value {} of type {}", - literal.ToString(), source_type()->ToString()); - } - if (literal.IsNull()) [[unlikely]] { - return Literal::Null(int32()); - } - - using namespace std::chrono; // NOLINT - switch (source_type()->type_id()) { - case TypeId::kDate: { - return Literal::Int(std::get(literal.value())); - } - case TypeId::kTimestamp: - case TypeId::kTimestampTz: { - auto value = std::get(literal.value()); - // Convert microseconds to `sys_days` (chronological days since epoch) - auto timestamp = sys_time(microseconds{value}); - auto days_since_epoch = floor(timestamp); - - return Literal::Int( - static_cast(days_since_epoch.time_since_epoch().count())); - } - default: - std::unreachable(); - } + ICEBERG_DCHECK(*literal.type() == *source_type(), + "Literal type must match source type"); + return TemporalUtils::ExtractDay(literal); } std::shared_ptr DayTransform::ResultType() const { return int32(); } @@ -377,33 +216,9 @@ HourTransform::HourTransform(std::shared_ptr const& source_type) : TransformFunction(TransformType::kHour, source_type) {} Result HourTransform::Transform(const Literal& literal) { - assert(literal.type() == source_type()); - if (literal.IsBelowMin() || literal.IsAboveMax()) { - return InvalidArgument( - "Cannot apply hour transform to literal with 
value {} of type {}", - literal.ToString(), source_type()->ToString()); - } - - if (literal.IsNull()) [[unlikely]] { - return Literal::Null(int32()); - } - - using namespace std::chrono; // NOLINT - switch (source_type()->type_id()) { - case TypeId::kTimestamp: - case TypeId::kTimestampTz: { - auto value = std::get(literal.value()); - // Create a `sys_time` object from the microseconds value - auto timestamp = sys_time(microseconds{value}); - - // Convert the time since epoch directly into hours - auto hours_since_epoch = duration_cast(timestamp.time_since_epoch()).count(); - - return Literal::Int(static_cast(hours_since_epoch)); - } - default: - std::unreachable(); - } + ICEBERG_DCHECK(*literal.type() == *source_type(), + "Literal type must match source type"); + return TemporalUtils::ExtractHour(literal); } std::shared_ptr HourTransform::ResultType() const { return int32(); } diff --git a/src/iceberg/transform_function.h b/src/iceberg/transform_function.h index 165390b11..fc0dd7231 100644 --- a/src/iceberg/transform_function.h +++ b/src/iceberg/transform_function.h @@ -51,6 +51,9 @@ class ICEBERG_EXPORT BucketTransform : public TransformFunction { BucketTransform(std::shared_ptr const& source_type, int32_t num_buckets); /// \brief Applies the bucket hash function to the input Literal. + /// + /// Reference: + /// - https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements Result Transform(const Literal& literal) override; /// \brief Returns INT32 as the output type. 
diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 3bd067d0c..436744811 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -115,6 +115,9 @@ class NameMapping; enum class SnapshotRefType; enum class TransformType; +class Decimal; +class Uuid; + class Expression; class Literal; diff --git a/src/iceberg/util/bucket_util.cc b/src/iceberg/util/bucket_util.cc new file mode 100644 index 000000000..88b240de7 --- /dev/null +++ b/src/iceberg/util/bucket_util.cc @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/util/bucket_util.h" + +#include + +#include "iceberg/expression/literal.h" +#include "iceberg/util/endian.h" +#include "iceberg/util/murmurhash3_internal.h" + +namespace iceberg { + +namespace { +template +int32_t HashLiteral(const Literal& literal) { + std::unreachable(); +} + +template <> +int32_t HashLiteral(const Literal& literal) { + return BucketUtils::HashInt(std::get(literal.value())); +} + +template <> +int32_t HashLiteral(const Literal& literal) { + return BucketUtils::HashInt(std::get(literal.value())); +} + +template <> +int32_t HashLiteral(const Literal& literal) { + return BucketUtils::HashLong(std::get(literal.value())); +} + +template <> +int32_t HashLiteral(const Literal& literal) { + return BucketUtils::HashLong(std::get(literal.value())); +} + +template <> +int32_t HashLiteral(const Literal& literal) { + return BucketUtils::HashLong(std::get(literal.value())); +} + +template <> +int32_t HashLiteral(const Literal& literal) { + return BucketUtils::HashLong(std::get(literal.value())); +} + +template <> +int32_t HashLiteral(const Literal& literal) { + const auto& decimal = std::get(literal.value()); + return BucketUtils::HashBytes(decimal.ToBigEndian()); +} + +template <> +int32_t HashLiteral(const Literal& literal) { + const auto& str = std::get(literal.value()); + return BucketUtils::HashBytes( + std::span(reinterpret_cast(str.data()), str.size())); +} + +template <> +int32_t HashLiteral(const Literal& literal) { + const auto& uuid = std::get(literal.value()); + return BucketUtils::HashBytes(uuid.bytes()); +} + +template <> +int32_t HashLiteral(const Literal& literal) { + const auto& binary = std::get>(literal.value()); + return BucketUtils::HashBytes(binary); +} + +template <> +int32_t HashLiteral(const Literal& literal) { + const auto& fixed = std::get>(literal.value()); + return BucketUtils::HashBytes(fixed); +} + +} // namespace + +int32_t BucketUtils::HashBytes(std::span bytes) { + int32_t hash_value = 0; + 
MurmurHash3_x86_32(bytes.data(), bytes.size(), 0, &hash_value); + return hash_value; +} + +int32_t BucketUtils::HashLong(int64_t value) { + int32_t hash_value = 0; + value = ToLittleEndian(value); + MurmurHash3_x86_32(&value, sizeof(int64_t), 0, &hash_value); + return hash_value; +} + +#define DISPATCH_HASH_LITERAL(TYPE_ID) \ + case TYPE_ID: \ + hash_value = HashLiteral(literal); \ + break; + +Result BucketUtils::BucketIndex(const Literal& literal, int32_t num_buckets) { + if (num_buckets <= 0) [[unlikely]] { + return InvalidArgument("Number of buckets must be positive, got {}", num_buckets); + } + + if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] { + return NotSupported("Cannot compute bucket index for {}", literal.ToString()); + } + + int32_t hash_value = 0; + switch (literal.type()->type_id()) { + DISPATCH_HASH_LITERAL(TypeId::kInt) + DISPATCH_HASH_LITERAL(TypeId::kDate) + DISPATCH_HASH_LITERAL(TypeId::kLong) + DISPATCH_HASH_LITERAL(TypeId::kTime) + DISPATCH_HASH_LITERAL(TypeId::kTimestamp) + DISPATCH_HASH_LITERAL(TypeId::kTimestampTz) + DISPATCH_HASH_LITERAL(TypeId::kDecimal) + DISPATCH_HASH_LITERAL(TypeId::kString) + DISPATCH_HASH_LITERAL(TypeId::kUuid) + DISPATCH_HASH_LITERAL(TypeId::kBinary) + DISPATCH_HASH_LITERAL(TypeId::kFixed) + default: + return NotSupported("Hashing not supported for type {}", + literal.type()->ToString()); + } + + return (hash_value & std::numeric_limits::max()) % num_buckets; +} + +} // namespace iceberg diff --git a/src/iceberg/util/bucket_util.h b/src/iceberg/util/bucket_util.h new file mode 100644 index 000000000..31a574b48 --- /dev/null +++ b/src/iceberg/util/bucket_util.h @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +class ICEBERG_EXPORT BucketUtils { + public: + /// \brief Hash a 32-bit integer using MurmurHash3 and return a 32-bit hash value. + /// \param value The input integer to hash. + /// \note Integer and long hash results must be identical for all integer values. This + /// ensures that schema evolution does not change bucket partition values if integer + /// types are promoted. + /// \return A 32-bit hash value. + static inline int32_t HashInt(int32_t value) { + return HashLong(static_cast(value)); + } + + /// \brief Hash a 64-bit integer using MurmurHash3 and return a 32-bit hash value. + /// \param value The input long to hash. + /// \return A 32-bit hash value. + static int32_t HashLong(int64_t value); + + /// \brief Hash a byte array using MurmurHash3 and return a 32-bit hash value. + /// \param bytes The input byte array to hash. + /// \return A 32-bit hash value. + static int32_t HashBytes(std::span bytes); + + /// \brief Compute the bucket index for a given literal and number of buckets. + /// \param literal The input literal to hash. + /// \param num_buckets The number of buckets to hash into. 
+ /// \return (murmur3_x86_32_hash(literal) & Integer.MAX_VALUE) % num_buckets + static Result BucketIndex(const Literal& literal, int32_t num_buckets); +}; + +} // namespace iceberg diff --git a/src/iceberg/util/conversions.cc b/src/iceberg/util/conversions.cc index e12e4815d..0cc7c55d8 100644 --- a/src/iceberg/util/conversions.cc +++ b/src/iceberg/util/conversions.cc @@ -23,6 +23,7 @@ #include #include +#include "iceberg/util/decimal.h" #include "iceberg/util/endian.h" #include "iceberg/util/macros.h" #include "iceberg/util/uuid.h" @@ -64,6 +65,12 @@ Result> ToBytesImpl(const Literal::Value& : static_cast(0x00)}; } +template <> +Result> ToBytesImpl(const Literal::Value& value) { + const auto& decimal = std::get(value); + return decimal.ToBigEndian(); +} + template <> Result> ToBytesImpl(const Literal::Value& value) { const auto& str = std::get(value); @@ -95,6 +102,7 @@ Result> Conversions::ToBytes(const PrimitiveType& type, const auto type_id = type.type_id(); switch (type_id) { + DISPATCH_LITERAL_TO_BYTES(TypeId::kBoolean) DISPATCH_LITERAL_TO_BYTES(TypeId::kInt) DISPATCH_LITERAL_TO_BYTES(TypeId::kDate) DISPATCH_LITERAL_TO_BYTES(TypeId::kLong) @@ -103,12 +111,11 @@ Result> Conversions::ToBytes(const PrimitiveType& type, DISPATCH_LITERAL_TO_BYTES(TypeId::kTimestampTz) DISPATCH_LITERAL_TO_BYTES(TypeId::kFloat) DISPATCH_LITERAL_TO_BYTES(TypeId::kDouble) - DISPATCH_LITERAL_TO_BYTES(TypeId::kBoolean) + DISPATCH_LITERAL_TO_BYTES(TypeId::kDecimal) DISPATCH_LITERAL_TO_BYTES(TypeId::kString) DISPATCH_LITERAL_TO_BYTES(TypeId::kUuid) DISPATCH_LITERAL_TO_BYTES(TypeId::kBinary) DISPATCH_LITERAL_TO_BYTES(TypeId::kFixed) - // TODO(Li Feiyang): Add support for Decimal default: return NotSupported("Serialization for type {} is not supported", type.ToString()); @@ -177,6 +184,11 @@ Result Conversions::FromBytes(const PrimitiveType& type, return Literal::Value{double_value}; } } + case TypeId::kDecimal: { + ICEBERG_ASSIGN_OR_RAISE(auto decimal, + 
Decimal::FromBigEndian(data.data(), data.size())); + return Literal::Value{decimal}; + } case TypeId::kString: return Literal::Value{ std::string(reinterpret_cast(data.data()), data.size())}; @@ -194,7 +206,6 @@ Result Conversions::FromBytes(const PrimitiveType& type, } return Literal::Value{std::vector(data.begin(), data.end())}; } - // TODO(Li Feiyang): Add support for Decimal default: return NotSupported("Deserialization for type {} is not supported", type.ToString()); diff --git a/src/iceberg/util/decimal.cc b/src/iceberg/util/decimal.cc index 5018574a7..f33d93287 100644 --- a/src/iceberg/util/decimal.cc +++ b/src/iceberg/util/decimal.cc @@ -24,12 +24,12 @@ #include "iceberg/util/decimal.h" +#include #include #include #include #include #include -#include #include #include #include @@ -44,6 +44,16 @@ namespace iceberg { namespace { +constexpr int32_t kMinDecimalBytes = 1; +constexpr int32_t kMaxDecimalBytes = 16; + +// The maximum decimal value that can be represented with kMaxPrecision digits. +// 10^38 - 1 +constexpr Decimal kMaxDecimalValue(5421010862427522170LL, 687399551400673279ULL); +// The minimum decimal value that can be represented with kMaxPrecision digits. +// - (10^38 - 1) +constexpr Decimal kMinDecimalValue(-5421010862427522171LL, 17759344522308878337ULL); + struct DecimalComponents { std::string_view while_digits; std::string_view fractional_digits; @@ -275,8 +285,15 @@ bool RescaleWouldCauseDataLoss(const Decimal& value, int32_t delta_scale, return res->second != 0; } + auto max_safe_value = kMaxDecimalValue / multiplier; + auto min_safe_value = kMinDecimalValue / multiplier; + if (value > max_safe_value || value < min_safe_value) { + // Overflow would happen — treat as data loss + return true; + } + *result = value * multiplier; - return (value < 0) ? 
*result > value : *result < value; + return false; } } // namespace @@ -470,11 +487,6 @@ Result Decimal::FromString(std::string_view str, int32_t* precision, } Result Decimal::FromBigEndian(const uint8_t* bytes, int32_t length) { - static constexpr int32_t kMinDecimalBytes = 1; - static constexpr int32_t kMaxDecimalBytes = 16; - - int64_t high, low; - if (length < kMinDecimalBytes || length > kMaxDecimalBytes) { return InvalidArgument( "Decimal::FromBigEndian: length must be in the range [{}, {}], was {}", @@ -486,7 +498,8 @@ Result Decimal::FromBigEndian(const uint8_t* bytes, int32_t length) { const bool is_negative = static_cast(bytes[0]) < 0; uint128_t result = 0; - std::memcpy(reinterpret_cast(&result) + 16 - length, bytes, length); + std::memcpy(reinterpret_cast(&result) + kMaxDecimalBytes - length, bytes, + length); if constexpr (std::endian::native == std::endian::little) { auto high = static_cast(result >> 64); @@ -505,6 +518,36 @@ Result Decimal::FromBigEndian(const uint8_t* bytes, int32_t length) { return Decimal(static_cast(result)); } +std::vector Decimal::ToBigEndian() const { + std::vector bytes(kMaxDecimalBytes); + + auto uvalue = static_cast(data_); + std::memcpy(bytes.data(), &uvalue, kMaxDecimalBytes); + + if constexpr (std::endian::native == std::endian::little) { + std::ranges::reverse(bytes); + } + + auto is_negative = data_ < 0; + int keep = kMaxDecimalBytes; + for (int32_t i = 0; i < kMaxDecimalBytes - 1; ++i) { + uint8_t byte = bytes[i]; + uint8_t next = bytes[i + 1]; + // For negative numbers, keep the leading 0xff byte if the next byte has its sign bit + // unset. For positive numbers, keep the leading 0x00 byte if the next byte has its + // sign bit set. 
+ if ((is_negative && byte == 0xff && (next & 0x80)) || + (!is_negative && byte == 0x00 && !(next & 0x80))) { + --keep; + } else { + break; + } + } + + bytes.erase(bytes.begin(), bytes.begin() + (kMaxDecimalBytes - keep)); + return bytes; +} + Result Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const { if (orig_scale == new_scale) { return *this; @@ -518,10 +561,7 @@ Result Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const { auto& multiplier = kDecimal128PowersOfTen[abs_delta_scale]; - const bool rescale_would_cause_data_loss = - RescaleWouldCauseDataLoss(*this, delta_scale, multiplier, &out); - - if (rescale_would_cause_data_loss) { + if (RescaleWouldCauseDataLoss(*this, delta_scale, multiplier, &out)) [[unlikely]] { return Invalid("Rescale {} from {} to {} would cause data loss", ToIntegerString(), orig_scale, new_scale); } @@ -534,6 +574,52 @@ bool Decimal::FitsInPrecision(int32_t precision) const { return Decimal::Abs(*this) < kDecimal128PowersOfTen[precision]; } +std::partial_ordering Decimal::Compare(const Decimal& lhs, const Decimal& rhs, + int32_t lhs_scale, int32_t rhs_scale) { + if (lhs_scale == rhs_scale || lhs.data_ == 0 || rhs.data_ == 0) { + return lhs <=> rhs; + } + + // If one is negative and the other is positive, the positive is greater. + if (lhs.data_ < 0 && rhs.data_ > 0) { + return std::partial_ordering::less; + } + if (lhs.data_ > 0 && rhs.data_ < 0) { + return std::partial_ordering::greater; + } + + // Both are negative + bool negative = lhs.data_ < 0 && rhs.data_ < 0; + + const int32_t delta_scale = lhs_scale - rhs_scale; + const int32_t abs_delta_scale = std::abs(delta_scale); + + ICEBERG_DCHECK(abs_delta_scale <= kMaxScale, ""); + + const auto& multiplier = kDecimal128PowersOfTen[abs_delta_scale]; + + Decimal adjusted_lhs; + Decimal adjusted_rhs; + + if (delta_scale < 0) { + // lhs_scale < rhs_scale + if (RescaleWouldCauseDataLoss(lhs, -delta_scale, multiplier, &adjusted_lhs)) + [[unlikely]] { + return negative ? 
std::partial_ordering::less : std::partial_ordering::greater; + } + adjusted_rhs = rhs; + } else { + // lhs_scale > rhs_scale + if (RescaleWouldCauseDataLoss(rhs, delta_scale, multiplier, &adjusted_rhs)) + [[unlikely]] { + return negative ? std::partial_ordering::greater : std::partial_ordering::less; + } + adjusted_lhs = lhs; + } + + return adjusted_lhs <=> adjusted_rhs; +} + std::array Decimal::ToBytes() const { std::array out{{0}}; std::memcpy(out.data(), &data_, kByteWidth); diff --git a/src/iceberg/util/decimal.h b/src/iceberg/util/decimal.h index 7e9cd7ce4..b7f57f47c 100644 --- a/src/iceberg/util/decimal.h +++ b/src/iceberg/util/decimal.h @@ -25,11 +25,13 @@ /// https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.h #include +#include #include #include #include #include #include +#include #include "iceberg/iceberg_export.h" #include "iceberg/result.h" @@ -142,7 +144,7 @@ class ICEBERG_EXPORT Decimal : public util::Formattable { /// \brief Convert the Decimal value to a base 10 decimal string with the given scale. /// \param scale The scale to use for the string representation. /// \return The string representation of the Decimal value. - Result ToString(int32_t scale = 0) const; + Result ToString(int32_t scale) const; /// \brief Convert the Decimal value to an integer string. std::string ToIntegerString() const; @@ -164,6 +166,11 @@ class ICEBERG_EXPORT Decimal : public util::Formattable { /// \return error status if the length is an invalid value static Result FromBigEndian(const uint8_t* data, int32_t length); + /// \brief Convert Decimal's unscaled value to two’s-complement big-endian binary, using + /// the minimum number of bytes for the value. + /// \return A vector containing the big-endian bytes. + std::vector ToBigEndian() const; + /// \brief Convert Decimal from one scale to another. 
Result Rescale(int32_t orig_scale, int32_t new_scale) const; @@ -180,6 +187,10 @@ class ICEBERG_EXPORT Decimal : public util::Formattable { return low() <=> other.low(); } + /// \brief Compare two Decimals with different scales. + static std::partial_ordering Compare(const Decimal& lhs, const Decimal& rhs, + int32_t lhs_scale, int32_t rhs_scale); + const uint8_t* native_endian_bytes() const { return reinterpret_cast(&data_); } diff --git a/src/iceberg/util/temporal_util.cc b/src/iceberg/util/temporal_util.cc new file mode 100644 index 000000000..41748c920 --- /dev/null +++ b/src/iceberg/util/temporal_util.cc @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/util/temporal_util.h" + +#include +#include + +#include "iceberg/expression/literal.h" + +namespace iceberg { + +namespace { + +using namespace std::chrono; // NOLINT + +constexpr auto kEpochYmd = year{1970} / January / 1; +constexpr auto kEpochDays = sys_days(kEpochYmd); + +inline constexpr year_month_day DateToYmd(int32_t days_since_epoch) { + return {kEpochDays + days{days_since_epoch}}; +} + +inline constexpr year_month_day TimestampToYmd(int64_t micros_since_epoch) { + return {floor(sys_time(microseconds{micros_since_epoch}))}; +} + +template + requires std::is_same_v || std::is_same_v +inline constexpr int32_t TimestampToDuration(int64_t micros_since_epoch) { + return static_cast( + floor( + sys_time(microseconds{micros_since_epoch}).time_since_epoch()) + .count()); +} + +inline constexpr int32_t MonthsSinceEpoch(const year_month_day& ymd) { + auto delta = ymd.year() - kEpochYmd.year(); + // Calculate the month as months from 1970-01 + // Note: January is month 1, so we subtract 1 to get zero-based month count. 
+ return static_cast(delta.count() * 12 + static_cast(ymd.month()) - + 1); +} + +template +Result ExtractYearImpl(const Literal& literal) { + std::unreachable(); +} + +template <> +Result ExtractYearImpl(const Literal& literal) { + auto value = std::get(literal.value()); + auto ymd = DateToYmd(value); + return Literal::Int(static_cast(ymd.year())); +} + +template <> +Result ExtractYearImpl(const Literal& literal) { + auto value = std::get(literal.value()); + auto ymd = TimestampToYmd(value); + return Literal::Int(static_cast(ymd.year())); +} + +template <> +Result ExtractYearImpl(const Literal& literal) { + return ExtractYearImpl(literal); +} + +template +Result ExtractMonthImpl(const Literal& literal) { + std::unreachable(); +} + +template <> +Result ExtractMonthImpl(const Literal& literal) { + auto value = std::get(literal.value()); + auto ymd = DateToYmd(value); + return Literal::Int(MonthsSinceEpoch(ymd)); +} + +template <> +Result ExtractMonthImpl(const Literal& literal) { + auto value = std::get(literal.value()); + auto ymd = TimestampToYmd(value); + return Literal::Int(MonthsSinceEpoch(ymd)); +} + +template <> +Result ExtractMonthImpl(const Literal& literal) { + return ExtractMonthImpl(literal); +} + +template +Result ExtractDayImpl(const Literal& literal) { + std::unreachable(); +} + +template <> +Result ExtractDayImpl(const Literal& literal) { + return Literal::Int(std::get(literal.value())); +} + +template <> +Result ExtractDayImpl(const Literal& literal) { + auto value = std::get(literal.value()); + return Literal::Int(TimestampToDuration(value)); +} + +template <> +Result ExtractDayImpl(const Literal& literal) { + return ExtractDayImpl(literal); +} + +template +Result ExtractHourImpl(const Literal& literal) { + std::unreachable(); +} + +template <> +Result ExtractHourImpl(const Literal& literal) { + auto value = std::get(literal.value()); + return Literal::Int(TimestampToDuration(value)); +} + +template <> +Result ExtractHourImpl(const Literal& literal) 
{ + return ExtractHourImpl(literal); +} + +} // namespace + +#define DISPATCH_EXTRACT_YEAR(type_id) \ + case type_id: \ + return ExtractYearImpl(literal); + +Result TemporalUtils::ExtractYear(const Literal& literal) { + if (literal.IsNull()) [[unlikely]] { + return Literal::Null(int32()); + } + + if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] { + return NotSupported("Cannot extract year from {}", literal.ToString()); + } + + switch (literal.type()->type_id()) { + DISPATCH_EXTRACT_YEAR(TypeId::kDate) + DISPATCH_EXTRACT_YEAR(TypeId::kTimestamp) + DISPATCH_EXTRACT_YEAR(TypeId::kTimestampTz) + default: + return NotSupported("Extract year from type {} is not supported", + literal.type()->ToString()); + } +} + +#define DISPATCH_EXTRACT_MONTH(type_id) \ + case type_id: \ + return ExtractMonthImpl(literal); + +Result TemporalUtils::ExtractMonth(const Literal& literal) { + if (literal.IsNull()) [[unlikely]] { + return Literal::Null(int32()); + } + + if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] { + return NotSupported("Cannot extract month from {}", literal.ToString()); + } + + switch (literal.type()->type_id()) { + DISPATCH_EXTRACT_MONTH(TypeId::kDate) + DISPATCH_EXTRACT_MONTH(TypeId::kTimestamp) + DISPATCH_EXTRACT_MONTH(TypeId::kTimestampTz) + default: + return NotSupported("Extract month from type {} is not supported", + literal.type()->ToString()); + } +} + +#define DISPATCH_EXTRACT_DAY(type_id) \ + case type_id: \ + return ExtractDayImpl(literal); + +Result TemporalUtils::ExtractDay(const Literal& literal) { + if (literal.IsNull()) [[unlikely]] { + return Literal::Null(int32()); + } + + if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] { + return NotSupported("Cannot extract day from {}", literal.ToString()); + } + + switch (literal.type()->type_id()) { + DISPATCH_EXTRACT_DAY(TypeId::kDate) + DISPATCH_EXTRACT_DAY(TypeId::kTimestamp) + DISPATCH_EXTRACT_DAY(TypeId::kTimestampTz) + default: + return NotSupported("Extract day 
from type {} is not supported", + literal.type()->ToString()); + } +} + +#define DISPATCH_EXTRACT_HOUR(type_id) \ + case type_id: \ + return ExtractHourImpl(literal); + +Result TemporalUtils::ExtractHour(const Literal& literal) { + if (literal.IsNull()) [[unlikely]] { + return Literal::Null(int32()); + } + + if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] { + return NotSupported("Cannot extract hour from {}", literal.ToString()); + } + + switch (literal.type()->type_id()) { + DISPATCH_EXTRACT_HOUR(TypeId::kTimestamp) + DISPATCH_EXTRACT_HOUR(TypeId::kTimestampTz) + default: + return NotSupported("Extract hour from type {} is not supported", + literal.type()->ToString()); + } +} + +} // namespace iceberg diff --git a/src/iceberg/util/temporal_util.h b/src/iceberg/util/temporal_util.h new file mode 100644 index 000000000..750c3d8b7 --- /dev/null +++ b/src/iceberg/util/temporal_util.h @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +class ICEBERG_EXPORT TemporalUtils { + public: + /// \brief Extract a date or timestamp year, as years from 1970 + static Result ExtractYear(const Literal& literal); + + /// \brief Extract a date or timestamp month, as months from 1970-01-01 + static Result ExtractMonth(const Literal& literal); + + /// \brief Extract a date or timestamp day, as days from 1970-01-01 + static Result ExtractDay(const Literal& literal); + + /// \brief Extract a timestamp hour, as hours from 1970-01-01 00:00:00 + static Result ExtractHour(const Literal& literal); +}; + +} // namespace iceberg diff --git a/src/iceberg/util/truncate_util.cc b/src/iceberg/util/truncate_util.cc new file mode 100644 index 000000000..9d0c6e7ea --- /dev/null +++ b/src/iceberg/util/truncate_util.cc @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/util/truncate_util.h" + +#include +#include +#include + +#include "iceberg/expression/literal.h" +#include "iceberg/util/checked_cast.h" + +namespace iceberg { + +namespace { +template +Literal TruncateLiteralImpl(const Literal& literal, int32_t width) { + std::unreachable(); +} + +template <> +Literal TruncateLiteralImpl(const Literal& literal, int32_t width) { + int32_t v = std::get(literal.value()); + return Literal::Int(TruncateUtils::TruncateInteger(v, width)); +} + +template <> +Literal TruncateLiteralImpl(const Literal& literal, int32_t width) { + int64_t v = std::get(literal.value()); + return Literal::Long(TruncateUtils::TruncateInteger(v, width)); +} + +template <> +Literal TruncateLiteralImpl(const Literal& literal, int32_t width) { + const auto& decimal = std::get(literal.value()); + auto type = internal::checked_pointer_cast(literal.type()); + return Literal::Decimal(TruncateUtils::TruncateDecimal(decimal, width).value(), + type->precision(), type->scale()); +} + +template <> +Literal TruncateLiteralImpl(const Literal& literal, int32_t width) { + // Strings are truncated to a valid UTF-8 string with no more than `width` code points. + const auto& str = std::get(literal.value()); + return Literal::String(TruncateUtils::TruncateUTF8(str, width)); +} + +template <> +Literal TruncateLiteralImpl(const Literal& literal, int32_t width) { + // In contrast to strings, binary values do not have an assumed encoding and are + // truncated to `width` bytes. 
+ const auto& data = std::get>(literal.value()); + if (data.size() <= width) { + return literal; + } + return Literal::Binary(std::vector(data.begin(), data.begin() + width)); +} + +} // namespace + +Decimal TruncateUtils::TruncateDecimal(const Decimal& decimal, int32_t width) { + return decimal - (((decimal % width) + width) % width); +} + +#define DISPATCH_TRUNCATE_LITERAL(TYPE_ID) \ + case TYPE_ID: \ + return TruncateLiteralImpl(literal, width); + +Result TruncateUtils::TruncateLiteral(const Literal& literal, int32_t width) { + if (literal.IsNull()) [[unlikely]] { + // Return null as is + return literal; + } + + if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] { + return NotSupported("Cannot truncate {}", literal.ToString()); + } + + switch (literal.type()->type_id()) { + DISPATCH_TRUNCATE_LITERAL(TypeId::kInt) + DISPATCH_TRUNCATE_LITERAL(TypeId::kLong) + DISPATCH_TRUNCATE_LITERAL(TypeId::kDecimal) + DISPATCH_TRUNCATE_LITERAL(TypeId::kString) + DISPATCH_TRUNCATE_LITERAL(TypeId::kBinary) + default: + return NotSupported("Truncate is not supported for type: {}", + literal.type()->ToString()); + } +} + +} // namespace iceberg diff --git a/src/iceberg/util/truncate_util.h b/src/iceberg/util/truncate_util.h index 5e76135c5..881c1d727 100644 --- a/src/iceberg/util/truncate_util.h +++ b/src/iceberg/util/truncate_util.h @@ -19,10 +19,13 @@ #pragma once +#include #include #include #include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" namespace iceberg { @@ -64,9 +67,25 @@ class ICEBERG_EXPORT TruncateUtils { /// values, the correct truncate function is: v - (((v % W) + W) % W) template requires std::is_same_v || std::is_same_v - static inline T TruncateInteger(T v, size_t W) { + static inline T TruncateInteger(T v, int32_t W) { return v - (((v % W) + W) % W); } + + /// \brief Truncate a Decimal to a specified width. + /// \param decimal The input Decimal to truncate. + /// \param width The width to truncate to. 
+ /// \return A Decimal truncated to the specified width. + static Decimal TruncateDecimal(const Decimal& decimal, int32_t width); + + /// \brief Truncate a Literal to a specified width. + /// \param literal The input Literal to truncate. + /// \param width The width to truncate to. + /// \return A Result containing the truncated Literal or an error. + /// Supported types are: INT, LONG, DECIMAL, STRING, BINARY. + /// Reference: + /// - [Truncate Transform + /// Details](https://iceberg.apache.org/spec/#truncate-transform-details) + static Result TruncateLiteral(const Literal& literal, int32_t width); }; } // namespace iceberg