feat: add decimal support and serialization/deserialization to Literal

zhjwpku · zhjwpku · commit 7aa5b4e776ed · 2025-09-20T00:02:25.000+08:00
diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc
@@ -21,8 +21,14 @@
 
 #include <cmath>
 #include <concepts>
+#include <cstring>
+#include <utility>
 
 #include "iceberg/exception.h"
+#include "iceberg/result.h"
+#include "iceberg/util/decimal.h"
+#include "iceberg/util/endian.h"
+#include "iceberg/util/macros.h"
 
 namespace iceberg {
 
@@ -149,13 +154,168 @@ Literal Literal::Binary(std::vector<uint8_t> value) {
   return {Value{std::move(value)}, binary()};
 }
 
+Literal Literal::Decimal(int128_t value, int32_t precision, int32_t scale) {
+  return Literal(Value{value}, decimal(precision, scale));
+}
+
+Result<Literal> Literal::Decimal(std::string_view value) {
+  // FromString reports the parsed precision and scale, which define the literal's type.
+  int32_t scale = 0;
+  int32_t precision = 0;
+  ICEBERG_ASSIGN_OR_RAISE(auto parsed, Decimal::FromString(value, &precision, &scale));
+  return Literal(Value{parsed.value()}, decimal(precision, scale));
+}
+
 Result<Literal> Literal::Deserialize(std::span<const uint8_t> data,
                                      std::shared_ptr<PrimitiveType> type) {
-  return NotImplemented("Deserialization of Literal is not implemented yet");
+  // Decodes `data` into a literal of `type`. Fixed-width numerics are little-endian and
+  // decimals big-endian; a fixed-width payload of the wrong size yields Invalid.
+  if (type == nullptr) {
+    return Invalid("Cannot deserialize a literal without a type");
+  }
+
+  // Copy bytes out rather than dereferencing a reinterpret_cast'ed pointer: `data` has
+  // no alignment guarantee, so a typed load through it could be undefined behavior.
+  auto read = [&data](auto raw) {
+    std::memcpy(&raw, data.data(), sizeof(raw));
+    return raw;
+  };
+  auto make = [&type](Value v) { return Literal(std::move(v), std::move(type)); };
+
+  switch (type->type_id()) {
+    case TypeId::kBoolean:
+      if (data.size() != 1) {
+        return Invalid("Invalid data size for Boolean literal deserialization");
+      }
+      // 0x00 is false; any non-zero byte is true.
+      return make(data[0] != 0);
+    case TypeId::kInt:
+    case TypeId::kDate:
+      if (data.size() != sizeof(int32_t)) {
+        return Invalid("Invalid data size for Int literal deserialization");
+      }
+      return make(FromLittleEndian(read(int32_t{})));
+    case TypeId::kLong:
+      // A 4-byte payload is widened: the field may have evolved from int to long.
+      if (data.size() == sizeof(int32_t)) {
+        return make(static_cast<int64_t>(FromLittleEndian(read(int32_t{}))));
+      }
+      if (data.size() == sizeof(int64_t)) {
+        return make(FromLittleEndian(read(int64_t{})));
+      }
+      return Invalid("Invalid data size for Long literal deserialization");
+    case TypeId::kFloat:
+      if (data.size() != sizeof(float)) {
+        return Invalid("Invalid data size for Float literal deserialization");
+      }
+      return make(FromLittleEndian(read(float{})));
+    case TypeId::kDouble:
+      // A 4-byte payload is widened: the field may have evolved from float to double.
+      if (data.size() == sizeof(float)) {
+        return make(static_cast<double>(FromLittleEndian(read(float{}))));
+      }
+      if (data.size() == sizeof(double)) {
+        return make(FromLittleEndian(read(double{})));
+      }
+      return Invalid("Invalid data size for Double literal deserialization");
+    case TypeId::kTime:
+    case TypeId::kTimestamp:
+    case TypeId::kTimestampTz:
+      if (data.size() != sizeof(int64_t)) {
+        return Invalid("Invalid data size for Timestamp/Time literal deserialization");
+      }
+      return make(FromLittleEndian(read(int64_t{})));
+    case TypeId::kString:
+      return make(std::string(data.begin(), data.end()));
+    case TypeId::kUuid:
+      if (data.size() != 16) {
+        return Invalid("Invalid data size for UUID literal deserialization");
+      }
+      return make(read(std::array<uint8_t, 16>{}));
+    case TypeId::kDecimal: {
+      ICEBERG_ASSIGN_OR_RAISE(auto unscaled_decimal,
+                              Decimal::FromBigEndian(data.data(), data.size()));
+      return make(unscaled_decimal.value());
+    }
+    case TypeId::kFixed:
+    case TypeId::kBinary:
+      return make(std::vector<uint8_t>(data.begin(), data.end()));
+    default:
+      return NotImplemented("Deserialization of this literal type is not implemented");
+  }
 }
 
 Result<std::vector<uint8_t>> Literal::Serialize() const {
-  return NotImplemented("Serialization of Literal is not implemented yet");
+  // Encodes the held value as bytes according to the literal's type:
+  //   - boolean: one byte, 1 for true and 0 for false;
+  //   - int/date: 4 bytes, long/time/timestamp/timestamptz: 8 bytes, each
+  //     converted with ToLittleEndian;
+  //   - float/double: 4 or 8 bytes, converted with ToLittleEndian;
+  //   - string, uuid, fixed, binary: the stored bytes, unchanged;
+  //   - decimal: the output of Decimal::ToBigEndian for the stored int128_t.
+  // AboveMax/BelowMin literals are rejected with Invalid, and a null literal
+  // serializes to an empty vector.
+  if (IsAboveMax() || IsBelowMin()) {
+    return Invalid("Cannot serialize AboveMax or BelowMin literal");
+  }
+  if (IsNull()) {
+    return std::vector<uint8_t>{};
+  }
+
+  // Shared path for the fixed-width numerics: convert with ToLittleEndian, then
+  // copy the object representation of the converted value into a new vector.
+  const auto little_endian_bytes = [](auto native) {
+    const decltype(native) converted = ToLittleEndian(native);
+    const auto* first = reinterpret_cast<const uint8_t*>(&converted);
+    return std::vector<uint8_t>(first, first + sizeof(converted));
+  };
+
+  // Each case reads the variant alternative that corresponds to its type id.
+  switch (type_->type_id()) {
+    case TypeId::kBoolean: {
+      const uint8_t flag = std::get<bool>(value_) ? 1 : 0;
+      return std::vector<uint8_t>{flag};
+    }
+
+    case TypeId::kInt:
+    case TypeId::kDate:
+      return little_endian_bytes(std::get<int32_t>(value_));
+
+    // All four of these are read from the int64_t alternative.
+    case TypeId::kLong:
+    case TypeId::kTime:
+    case TypeId::kTimestamp:
+    case TypeId::kTimestampTz:
+      return little_endian_bytes(std::get<int64_t>(value_));
+
+    case TypeId::kFloat:
+      return little_endian_bytes(std::get<float>(value_));
+
+    case TypeId::kDouble:
+      return little_endian_bytes(std::get<double>(value_));
+
+    case TypeId::kString: {
+      const auto& text = std::get<std::string>(value_);
+      return std::vector<uint8_t>(text.begin(), text.end());
+    }
+
+    case TypeId::kUuid: {
+      const auto& uuid = std::get<std::array<uint8_t, 16>>(value_);
+      return std::vector<uint8_t>(uuid.begin(), uuid.end());
+    }
+
+    case TypeId::kDecimal:
+      return Decimal::ToBigEndian(std::get<int128_t>(value_));
+
+    case TypeId::kFixed:
+    case TypeId::kBinary:
+      // Returning the stored vector copies it into the result.
+      return std::get<std::vector<uint8_t>>(value_);
+
+    default:
+      // No other type id is expected here.
+      std::unreachable();
+  }
 }
 
 // Getters
@@ -249,6 +409,13 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const {
       return this_val <=> other_val;
     }
 
+    case TypeId::kDecimal: {
+      // TODO(zhjwpku): Handle precision/scale differences
+      auto this_val = std::get<int128_t>(value_);
+      auto other_val = std::get<int128_t>(other.value_);
+      return this_val <=> other_val;
+    }
+
     default:
       // For unsupported types, return unordered
       return std::partial_ordering::unordered;
diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h
@@ -22,11 +22,13 @@
 #include <compare>
 #include <memory>
 #include <string>
+#include <string_view>
 #include <variant>
 #include <vector>
 
 #include "iceberg/result.h"
 #include "iceberg/type.h"
+#include "iceberg/util/int128.h"
 
 namespace iceberg {
 
@@ -56,7 +58,8 @@ class ICEBERG_EXPORT Literal {
                              double,          // for double
                              std::string,     // for string
                              std::vector<uint8_t>,     // for binary, fixed
-                             std::array<uint8_t, 16>,  // for uuid and decimal
+                             std::array<uint8_t, 16>,  // for uuid
+                             int128_t,                 // for decimal
                              BelowMin, AboveMax>;
 
   /// \brief Factory methods for primitive types
@@ -71,6 +74,8 @@ class ICEBERG_EXPORT Literal {
   static Literal Double(double value);
   static Literal String(std::string value);
   static Literal Binary(std::vector<uint8_t> value);
+  static Literal Decimal(int128_t value, int32_t precision, int32_t scale);
+  static Result<Literal> Decimal(std::string_view value);
 
   /// \brief Create a literal representing a null value.
   static Literal Null(std::shared_ptr<PrimitiveType> type) {
diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc
@@ -27,6 +27,7 @@
 
 #include "iceberg/expression/literal.h"
 #include "iceberg/type.h"
+#include "iceberg/util/int128.h"
 #include "iceberg/util/murmurhash3_internal.h"
 #include "iceberg/util/truncate_util.h"
 
@@ -73,6 +74,8 @@ Result<Literal> BucketTransform::Transform(const Literal& literal) {
           MurmurHash3_x86_32(&value, sizeof(int64_t), 0, &hash_value);
         } else if constexpr (std::is_same_v<T, std::array<uint8_t, 16>>) {
           MurmurHash3_x86_32(value.data(), sizeof(uint8_t) * 16, 0, &hash_value);
+        } else if constexpr (std::is_same_v<T, int128_t>) {
+          MurmurHash3_x86_32(&value, sizeof(int128_t), 0, &hash_value);
         } else if constexpr (std::is_same_v<T, std::string>) {
           MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value);
         } else if constexpr (std::is_same_v<T, std::vector<uint8_t>>) {
diff --git a/src/iceberg/util/decimal.cc b/src/iceberg/util/decimal.cc
@@ -24,6 +24,7 @@
 
 #include "iceberg/util/decimal.h"
 
+#include <algorithm>
 #include <bit>
 #include <charconv>
 #include <climits>
@@ -44,6 +45,9 @@ namespace iceberg {
 
 namespace {
 
+constexpr int32_t kMinDecimalBytes = 1;   // shortest length FromBigEndian accepts
+constexpr int32_t kMaxDecimalBytes = 16;  // longest length accepted; 16 bytes = 128 bits
+
 struct DecimalComponents {
   std::string_view while_digits;
   std::string_view fractional_digits;
@@ -472,11 +476,6 @@ Result<Decimal> Decimal::FromString(std::string_view str, int32_t* precision,
 }
 
 Result<Decimal> Decimal::FromBigEndian(const uint8_t* bytes, int32_t length) {
-  static constexpr int32_t kMinDecimalBytes = 1;
-  static constexpr int32_t kMaxDecimalBytes = 16;
-
-  int64_t high, low;
-
   if (length < kMinDecimalBytes || length > kMaxDecimalBytes) {
     return InvalidArgument(
         "Decimal::FromBigEndian: length must be in the range [{}, {}], was {}",
@@ -507,6 +506,36 @@ Result<Decimal> Decimal::FromBigEndian(const uint8_t* bytes, int32_t length) {
   return Decimal(static_cast<int128_t>(result));
 }
 
+std::vector<uint8_t> Decimal::ToBigEndian(int128_t value) {
+  // Encodes `value` as big-endian two's complement using the fewest bytes that still
+  // decode to the same number, e.g. 1 -> {0x01}, -1 -> {0xff}, 128 -> {0x00, 0x80}.
+  //
+  // Step 1: write all 16 bytes, most significant byte first.
+  std::vector<uint8_t> bytes(kMaxDecimalBytes);
+  const auto uvalue = static_cast<uint128_t>(value);
+  std::memcpy(bytes.data(), &uvalue, sizeof(uvalue));
+  if constexpr (std::endian::native == std::endian::little) {
+    std::ranges::reverse(bytes);
+  }
+
+  // Step 2: count the redundant leading bytes. A leading byte may be dropped only when
+  // it is pure sign extension (0xff for negative values, 0x00 otherwise) AND the byte
+  // after it carries the same sign bit; dropping it in any other case would flip the
+  // sign of the decoded value.
+  const bool is_negative = value < 0;
+  const uint8_t sign_fill = is_negative ? 0xff : 0x00;
+  const uint8_t sign_bit = is_negative ? 0x80 : 0x00;
+  int32_t redundant = 0;
+  // The bound keeps the final byte, so the result is never empty.
+  while (redundant < kMaxDecimalBytes - 1 && bytes[redundant] == sign_fill &&
+         (bytes[redundant + 1] & 0x80) == sign_bit) {
+    ++redundant;
+  }
+
+  bytes.erase(bytes.begin(), bytes.begin() + redundant);
+  return bytes;
+}
+
 Result<Decimal> Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const {
   if (orig_scale == new_scale) {
     return *this;
diff --git a/src/iceberg/util/decimal.h b/src/iceberg/util/decimal.h
@@ -30,6 +30,7 @@
 #include <string>
 #include <string_view>
 #include <type_traits>
+#include <vector>
 
 #include "iceberg/iceberg_export.h"
 #include "iceberg/result.h"
@@ -164,6 +165,12 @@ class ICEBERG_EXPORT Decimal : public util::Formattable {
   /// \return error status if the length is an invalid value
   static Result<Decimal> FromBigEndian(const uint8_t* data, int32_t length);
 
+  /// \brief Convert Decimal's unscaled value to two's-complement big-endian binary, using
+  ///        the minimum number of bytes for the value.
+  /// \param value The unscaled value.
+  /// \return A vector containing the big-endian bytes, most significant byte first.
+  static std::vector<uint8_t> ToBigEndian(int128_t value);
+
   /// \brief Convert Decimal from one scale to another.
   Result<Decimal> Rescale(int32_t orig_scale, int32_t new_scale) const;
 
diff --git a/test/decimal_test.cc b/test/decimal_test.cc
@@ -490,6 +490,40 @@ TEST(DecimalTest, FromBigEndianInvalid) {
               IsError(ErrorKind::kInvalidArgument));
 }
 
+TEST(DecimalTest, ToBigEndian) {
+  // Round trip: each (high, low) word pair below is encoded with ToBigEndian and
+  // must decode through FromBigEndian to an equal Decimal.
+  constexpr int64_t kInt32Max = INT32_MAX;
+  constexpr int64_t kInt32Min = INT32_MIN;
+  constexpr uint64_t kUint32Max = UINT32_MAX;
+
+  // Upper words: 0, +/-1, values at and just past the int32 limits, and the int64 limits.
+  const int64_t high_words[] = {0, 1, -1,
+                                kInt32Max, kInt32Min,
+                                kInt32Max + 1, kInt32Min - 1,
+                                INT64_MAX, INT64_MIN};
+
+  // Lower words: 0, 1, UINT32_MAX, then UINT32_MAX + 1 through UINT32_MAX + 8, and
+  // finally UINT64_MAX.
+  std::vector<uint64_t> low_words = {0, 1, kUint32Max};
+  for (uint64_t offset = 1; offset <= 8; ++offset) {
+    low_words.push_back(kUint32Max + offset);
+  }
+  low_words.push_back(UINT64_MAX);
+
+  // Check every combination of upper and lower word.
+  for (const int64_t high : high_words) {
+    for (const uint64_t low : low_words) {
+      Decimal original(high, low);
+      const auto encoded = Decimal::ToBigEndian(original.value());
+      auto decoded = Decimal::FromBigEndian(encoded.data(), encoded.size());
+      // ASSERT (not EXPECT) so a failed decode stops before value() is read.
+      ASSERT_THAT(decoded, IsOk());
+      EXPECT_EQ(decoded.value(), original);
+    }
+  }
+}
+
 TEST(DecimalTestFunctionality, Multiply) {
   ASSERT_EQ(Decimal(60501), Decimal(301) * Decimal(201));
   ASSERT_EQ(Decimal(-60501), Decimal(-301) * Decimal(201));
diff --git a/test/literal_test.cc b/test/literal_test.cc