From 5a15ef2a88073af4afc9f293c652763dfc92b719 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sun, 17 Aug 2025 22:50:05 +0800 Subject: [PATCH 01/20] feat: add decimal value representation --- src/iceberg/CMakeLists.txt | 1 + src/iceberg/expression/decimal.cc | 826 ++++++++++++++++++++++++++++++ src/iceberg/expression/decimal.h | 211 ++++++++ src/iceberg/result.h | 6 + src/iceberg/util/port.h | 45 ++ test/CMakeLists.txt | 6 +- test/decimal_test.cc | 153 ++++++ 7 files changed, 1247 insertions(+), 1 deletion(-) create mode 100644 src/iceberg/expression/decimal.cc create mode 100644 src/iceberg/expression/decimal.h create mode 100644 src/iceberg/util/port.h create mode 100644 test/decimal_test.cc diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index ae282a3af..882c5a681 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -19,6 +19,7 @@ set(ICEBERG_INCLUDES "$" "$") set(ICEBERG_SOURCES catalog/in_memory_catalog.cc + expression/decimal.cc expression/expression.cc expression/literal.cc file_reader.cc diff --git a/src/iceberg/expression/decimal.cc b/src/iceberg/expression/decimal.cc new file mode 100644 index 000000000..e5ac492ef --- /dev/null +++ b/src/iceberg/expression/decimal.cc @@ -0,0 +1,826 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/expression/decimal.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iceberg/result.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +namespace { + +static constexpr uint64_t kInt64Mask = 0xFFFFFFFFFFFFFFFF; + +// Convenience wrapper type over 128 bit unsigned integers. This class merely provides the +// minimum necessary set of functions to perform 128 bit multiplication operations. +struct Uint128 { + Uint128() = default; + Uint128(uint64_t high, uint64_t low) + : val_((static_cast(high) << 64) | low) {} + + explicit Uint128(uint64_t value) : val_(value) {} + explicit Uint128(const Decimal& decimal) + : val_((static_cast(decimal.high()) << 64) | decimal.low()) {} + + uint64_t high() const { return static_cast(val_ >> 64); } + uint64_t low() const { return static_cast(val_ & kInt64Mask); } + + Uint128& operator+=(const Uint128& other) { + val_ += other.val_; + return *this; + } + + Uint128& operator*=(const Uint128& other) { + val_ *= other.val_; + return *this; + } + + uint128_t val_{}; +}; + +// Signed addition with well-defined behaviour on overflow (as unsigned) +template + requires std::is_signed_v +constexpr SignedInt SafeSignedAdd(SignedInt u, SignedInt v) { + using UnsignedInt = std::make_unsigned_t; + return static_cast(static_cast(u) + + static_cast(v)); +} + +// Signed subtraction with well-defined behaviour on overflow (as unsigned) +template + requires std::is_signed_v +constexpr SignedInt SafeSignedSubtract(SignedInt u, SignedInt v) { + using UnsignedInt = std::make_unsigned_t; + return static_cast(static_cast(u) - + static_cast(v)); +} + +// Signed negation with well-defined behaviour on overflow (as unsigned) +template + requires std::is_signed_v +constexpr SignedInt SafeSignedNegate(SignedInt u) { + using UnsignedInt = std::make_unsigned_t; + return static_cast(~static_cast(u) + 1); +} + +// Signed left shift with well-defined behaviour on negative numbers or overflow +template + requires std::is_signed_v && std::is_integral_v +constexpr SignedInt SafeLeftShift(SignedInt u, Shift shift) { + using UnsignedInt = std::make_unsigned_t; + return static_cast(static_cast(u) << shift); +} + +/// \brief Expands the given value into a big endian array of ints so that we can work on +/// it. The array will be converted to an absolute value. The array will remove leading +/// zeros from the value. +/// \param array a big endian array of length 4 to set with the value +/// \result the output length of the array +static inline int64_t FillInArray(const Decimal& value, uint32_t* array) { + Decimal abs_value = Decimal::Abs(value); + auto high = static_cast(abs_value.high()); + auto low = abs_value.low(); + + if (high != 0) { + if (high > std::numeric_limits::max()) { + array[0] = static_cast(high >> 32); + array[1] = static_cast(high); + array[2] = static_cast(low >> 32); + array[3] = static_cast(low); + return 4; + } + + array[0] = static_cast(high); + array[1] = static_cast(low >> 32); + array[2] = static_cast(low); + return 3; + } + + if (low > std::numeric_limits::max()) { + array[0] = static_cast(low >> 32); + array[1] = static_cast(low); + return 2; + } + + if (low == 0) { + return 0; + } + + array[0] = static_cast(low); + return 1; +} + +/// \brief Shift the number in the array left by bits positions. +static inline void ShiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) { + if (length > 0 && bits > 0) { + for (int i = 0; i < length - 1; ++i) { + array[i] = (array[i] << bits) | (array[i + 1] >> (32 - bits)); + } + array[length - 1] <<= bits; + } +} + +/// \brief Shift the number in the array right by bits positions. +static inline void ShiftArrayRight(uint32_t* array, int64_t length, int64_t bits) { + if (length > 0 && bits > 0) { + for (int i = length - 1; i > 0; --i) { + array[i] = (array[i] >> bits) | (array[i - 1] << (32 - bits)); + } + array[0] >>= bits; + } +} + +/// \brief Fix the signs of the result and remainder after a division operation. +static inline void FixDivisionSigns(Decimal* result, Decimal* remainder, + bool dividend_negative, bool divisor_negative) { + if (dividend_negative != divisor_negative) { + result->Negate(); + } + if (dividend_negative) { + remainder->Negate(); + } +} + +/// \brief Build a Decimal from a big-endian array of uint32_t. +static Status BuildFromArray(Decimal* result, const uint32_t* array, int64_t length) { + // result_array[0] is low, result_array[1] is high + std::array result_array = {0, 0}; + for (int64_t i = length - 2 * 2 - 1; i >= 0; i--) { + if (array[i] != 0) { + return Overflow("Decimal division overflow"); + } + } + + // Construct the array in reverse order + int64_t next_index = length - 1; + for (size_t i = 0; i < 2 && next_index >= 0; i++) { + uint64_t lower_bits = array[next_index--]; + result_array[i] = (next_index < 0) + ? lower_bits + : (static_cast(lower_bits) << 32) | lower_bits; + } + + *result = Decimal(result_array[1], result_array[0]); + return {}; +} + +/// \brief Do a division where the divisor fits into a single 32-bit integer. +static inline Status SingleDivide(const uint32_t* dividend, int64_t dividend_length, + uint32_t divisor, bool dividend_negative, + bool divisor_negative, Decimal* result, + Decimal* remainder) { + uint64_t r = 0; + constexpr int64_t kDecimalArrayLength = Decimal::kBitWidth / sizeof(uint32_t) + 1; + std::array quotient; + for (int64_t i = 0; i < dividend_length; ++i) { + r <<= 32; + r |= dividend[i]; + if (r < divisor) { + quotient[i] = 0; + } else { + quotient[i] = static_cast(r / divisor); + r -= static_cast(quotient[i]) * divisor; + } + } + + ICEBERG_RETURN_UNEXPECTED(BuildFromArray(result, quotient.data(), dividend_length)); + + *remainder = static_cast(r); + FixDivisionSigns(result, remainder, dividend_negative, divisor_negative); + return {}; +} + +/// \brief Divide two Decimal values and return the result and remainder. +static inline Status DecimalDivide(const Decimal& dividend, const Decimal& divisor, + Decimal* result, Decimal* remainder) { + constexpr int64_t kDecimalArrayLength = Decimal::kBitWidth / sizeof(uint32_t); + // Split the dividend and divisor into 32-bit integer pieces so that we can work on + // them. + std::array dividend_array; + std::array divisor_array; + bool dividend_negative = dividend.high() < 0; + bool divisor_negative = divisor.high() < 0; + + // leave an extra zero before the dividend + dividend_array[0] = 0; + int64_t dividend_length = FillInArray(dividend, dividend_array.data() + 1) + 1; + int64_t divisor_length = FillInArray(divisor, divisor_array.data()); + + if (dividend_length <= divisor_length) { + *remainder = dividend; + *result = 0; + return {}; + } + + if (divisor_length == 0) { + return DivideByZero("Cannot divide by zero in DecimalDivide"); + } + + if (divisor_length == 1) { + ICEBERG_RETURN_UNEXPECTED(SingleDivide(dividend_array.data(), dividend_length, + divisor_array[0], dividend_negative, + divisor_negative, result, remainder)); + return {}; + } + + int64_t result_length = dividend_length - divisor_length; + std::array result_array; + + assert(result_length < kDecimalArrayLength); + + // Normalize by shifting both by a multiple of 2 so that the digit guessing is easier. + // The requirement is that divisor_array[0] is greater than 2**31. + int64_t normalize_bits = std::countl_zero(divisor_array[0]); + ShiftArrayLeft(divisor_array.data(), divisor_length, normalize_bits); + ShiftArrayLeft(dividend_array.data(), dividend_length, normalize_bits); + + // compute each digit in the result + for (int64_t i = 0; i < result_length; ++i) { + // Guess the next digit. At worst it is two too large + uint32_t guess = std::numeric_limits::max(); + const auto high_dividend = + static_cast(dividend_array[i]) << 32 | dividend_array[i + 1]; + if (dividend_array[i] != divisor_array[0]) { + guess = static_cast(high_dividend / divisor_array[0]); + } + + // catch all the cases where the guess is two too large and most of the cases where it + // is one too large + auto rhat = static_cast(high_dividend - + guess * static_cast(divisor_array[0])); + while (static_cast(divisor_array[1]) * guess > + (static_cast(rhat) << 32) + dividend_array[i + 2]) { + guess--; + rhat += divisor_array[0]; + if (static_cast(rhat) < divisor_array[1]) { + break; + } + } + + // substract off the guess * divisor from the dividend + uint64_t mult = 0; + for (int64_t j = divisor_length - 1; j >= 0; --j) { + mult += static_cast(guess) * divisor_array[j]; + uint32_t prev = dividend_array[i + j + 1]; + dividend_array[i + j + 1] -= static_cast(mult); + mult >>= 32; + if (dividend_array[i + j + 1] > prev) { + ++mult; + } + } + uint32_t prev = dividend_array[i]; + dividend_array[i] -= static_cast(mult); + + // if guess was too big, we add back divisor + if (dividend_array[i] > prev) { + guess--; + uint32_t carry = 0; + for (int64_t j = divisor_length - 1; j >= 0; --j) { + const auto sum = + static_cast(divisor_array[j]) + dividend_array[i + j + 1] + carry; + dividend_array[i + j + 1] = static_cast(sum); + carry = static_cast(sum >> 32); + } + dividend_array[i] += carry; + } + + result_array[i] = guess; + } + + // denormalize the remainder + ShiftArrayRight(dividend_array.data(), dividend_length, normalize_bits); + + // return result and remainder + ICEBERG_RETURN_UNEXPECTED(BuildFromArray(result, result_array.data(), result_length)); + ICEBERG_RETURN_UNEXPECTED( + BuildFromArray(remainder, dividend_array.data(), dividend_length)); + + FixDivisionSigns(result, remainder, dividend_negative, divisor_negative); + return {}; +} + +/// \brief Powers of ten for Decimal with scale from 0 to 38. +constexpr std::array kDecimal128PowersOfTen = { + Decimal(1LL), + Decimal(10LL), + Decimal(100LL), + Decimal(1000LL), + Decimal(10000LL), + Decimal(100000LL), + Decimal(1000000LL), + Decimal(10000000LL), + Decimal(100000000LL), + Decimal(1000000000LL), + Decimal(10000000000LL), + Decimal(100000000000LL), + Decimal(1000000000000LL), + Decimal(10000000000000LL), + Decimal(100000000000000LL), + Decimal(1000000000000000LL), + Decimal(10000000000000000LL), + Decimal(100000000000000000LL), + Decimal(1000000000000000000LL), + Decimal(0LL, 10000000000000000000ULL), + Decimal(5LL, 7766279631452241920ULL), + Decimal(54LL, 3875820019684212736ULL), + Decimal(542LL, 1864712049423024128ULL), + Decimal(5421LL, 200376420520689664ULL), + Decimal(54210LL, 2003764205206896640ULL), + Decimal(542101LL, 1590897978359414784ULL), + Decimal(5421010LL, 15908979783594147840ULL), + Decimal(54210108LL, 11515845246265065472ULL), + Decimal(542101086LL, 4477988020393345024ULL), + Decimal(5421010862LL, 7886392056514347008ULL), + Decimal(54210108624LL, 5076944270305263616ULL), + Decimal(542101086242LL, 13875954555633532928ULL), + Decimal(5421010862427LL, 9632337040368467968ULL), + Decimal(54210108624275LL, 4089650035136921600ULL), + Decimal(542101086242752LL, 4003012203950112768ULL), + Decimal(5421010862427522LL, 3136633892082024448ULL), + Decimal(54210108624275221LL, 12919594847110692864ULL), + Decimal(542101086242752217LL, 68739955140067328ULL), + Decimal(5421010862427522170LL, 687399551400673280ULL)}; +} // namespace + +Decimal::Decimal(std::string_view str) { + auto result = Decimal::FromString(str); + if (!result) { + throw std::runtime_error( + std::format("Failed to parse Decimal from string: {}, error: {}", str, + result.error().message)); + } + *this = std::move(result.value()); +} + +Decimal& Decimal::Negate() { + uint64_t result_low = ~low() + 1; + int64_t result_high = ~high(); + if (result_low == 0) { + result_high = SafeSignedAdd(result_high, 1); + } + *this = Decimal(result_high, result_low); + return *this; +} + +Decimal& Decimal::Abs() { return *this < 0 ? Negate() : *this; } + +Decimal Decimal::Abs(const Decimal& value) { + Decimal result(value); + return result.Abs(); +} + +Decimal& Decimal::operator+=(const Decimal& other) { + int64_t result_high = SafeSignedAdd(high(), other.high()); + uint64_t result_low = low() + other.low(); + result_high = SafeSignedAdd(result_high, result_low < low()); + *this = Decimal(result_high, result_low); + return *this; +} + +Decimal& Decimal::operator-=(const Decimal& other) { + int64_t result_high = SafeSignedSubtract(high(), other.high()); + uint64_t result_low = low() - other.low(); + result_high = SafeSignedSubtract(result_high, result_low > low()); + *this = Decimal(result_high, result_low); + return *this; +} + +Decimal& Decimal::operator*=(const Decimal& other) { + // Since the max value of Decimal is supposed to be 1e38 - 1 and the min the + // negation taking the abosolute values here should aways be safe. + const bool negate = Sign() != other.Sign(); + Decimal x = Decimal::Abs(*this); + Decimal y = Decimal::Abs(other); + + Uint128 r(x); + r *= Uint128(y); + + *this = Decimal(static_cast(r.high()), r.low()); + if (negate) { + Negate(); + } + return *this; +} + +Result> Decimal::Divide(const Decimal& divisor) const { + std::pair result; + ICEBERG_RETURN_UNEXPECTED(DecimalDivide(*this, divisor, &result.first, &result.second)); + return result; +} + +Decimal& Decimal::operator/=(const Decimal& other) { + Decimal remainder; + auto s = DecimalDivide(*this, other, this, &remainder); + assert(s); + return *this; +} + +Decimal& Decimal::operator|=(const Decimal& other) { + data_[0] |= other.data_[0]; + data_[1] |= other.data_[1]; + return *this; +} + +Decimal& Decimal::operator&=(const Decimal& other) { + data_[0] &= other.data_[0]; + data_[1] &= other.data_[1]; + return *this; +} + +Decimal& Decimal::operator<<=(uint32_t shift) { + if (shift != 0) { + uint64_t low_; + int64_t high_; + if (shift < 64) { + high_ = SafeLeftShift(high(), shift); + high_ |= low() >> (64 - shift); + low_ = low() << shift; + } else if (shift < 128) { + high_ = static_cast(low() << (shift - 64)); + low_ = 0; + } else { + high_ = 0; + low_ = 0; + } + *this = Decimal(high_, low_); + } + + return *this; +} + +Decimal& Decimal::operator>>=(uint32_t shift) { + if (shift != 0) { + uint64_t low_; + int64_t high_; + if (shift < 64) { + low_ = low() >> shift; + low_ |= static_cast(high()) << (64 - shift); + high_ = high() >> shift; + } else if (shift < 128) { + low_ = static_cast(high() >> (shift - 64)); + high_ = high() >> 63; + } else { + high_ = high() >> 63; + low_ = static_cast(high_); + } + *this = Decimal(high_, low_); + } + + return *this; +} + +namespace { + +struct DecimalComponents { + std::string_view while_digits; + std::string_view fractional_digits; + int32_t exponent; + char sign{0}; + bool has_exponent{false}; +}; + +inline bool IsSign(char c) { return c == '+' || c == '-'; } + +inline bool IsDigit(char c) { return c >= '0' && c <= '9'; } + +inline bool IsDot(char c) { return c == '.'; } + +inline bool StartsExponent(char c) { return c == 'e' || c == 'E'; } + +inline size_t ParseDigitsRun(std::string_view str, size_t pos, std::string_view* out) { + size_t start = pos; + while (pos < str.size() && IsDigit(str[pos])) { + ++pos; + } + *out = str.substr(start, pos - start); + return pos; +} + +bool ParseDecimalComponents(std::string_view str, DecimalComponents* out) { + size_t pos = 0; + + if (str.empty()) { + return false; + } + + // Sign of the number + if (IsSign(str[pos])) { + out->sign = str[pos++]; + } + // First run of digits + pos = ParseDigitsRun(str, pos, &out->while_digits); + if (pos == str.size()) { + return !out->while_digits.empty(); + } + + // Optional dot + if (IsDot(str[pos])) { + ++pos; + // Second run of digits after the dot + pos = ParseDigitsRun(str, pos, &out->fractional_digits); + } + if (out->fractional_digits.empty() && out->while_digits.empty()) { + // Need at least some digits (whole or fractional) + return false; + } + if (pos == str.size()) { + return true; + } + + // Optional exponent part + if (StartsExponent(str[pos])) { + ++pos; + // Skip '+' sign, '-' sign will be handled by from_chars + if (pos < str.size() && str[pos] == '+') { + ++pos; + } + out->has_exponent = true; + auto [ptr, ec] = + std::from_chars(str.data() + pos, str.data() + str.size(), out->exponent); + if (ec != std::errc()) { + return false; // Failed to parse exponent + } + pos = ptr - str.data(); + } + + return pos == str.size(); +} + +constexpr auto kInt64DecimalDigits = + static_cast(std::numeric_limits::digits10); + +constexpr std::array kUInt64PowersOfTen = { + // clang-format off + 1ULL, + 10ULL, + 100ULL, + 1000ULL, + 10000ULL, + 100000ULL, + 1000000ULL, + 10000000ULL, + 100000000ULL, + 1000000000ULL, + 10000000000ULL, + 100000000000ULL, + 1000000000000ULL, + 10000000000000ULL, + 100000000000000ULL, + 1000000000000000ULL, + 10000000000000000ULL, + 100000000000000000ULL, + 1000000000000000000ULL + // clang-format on +}; + +static inline void ShiftAndAdd(std::string_view input, std::array& out) { + for (size_t pos = 0; pos < input.size();) { + const size_t group_size = std::min(kInt64DecimalDigits, input.size() - pos); + const uint64_t multiple = kUInt64PowersOfTen[group_size]; + uint64_t value = 0; + + std::from_chars(input.data() + pos, input.data() + pos + group_size, value); + + for (size_t i = 0; i < 2; ++i) { + Uint128 tmp(out[i]); + tmp *= Uint128(multiple); + tmp += Uint128(value); + out[i] = tmp.low(); + value = tmp.high(); + } + pos += group_size; + } +} + +static void AppendLittleEndianArrayToString(const std::array& array, + std::string* out) { + // TODO(zhjwpku): Implementation this. +} + +} // namespace + +Result Decimal::ToString(int32_t scale) const { + if (scale < -kMaxScale || scale > kMaxScale) { + return InvalidArgument( + "Decimal::ToString: scale must be in the range [-{}, {}], was {}", kMaxScale, + kMaxScale, scale); + } + return NotImplemented("Decimal::ToString is not implemented yet"); +} + +std::string Decimal::ToIntegerString() const { + std::string result; + if (high() < 0) { + result.push_back('-'); + Decimal abs = *this; + abs.Negate(); + AppendLittleEndianArrayToString({abs.low(), static_cast(abs.high())}, + &result); + } else { + AppendLittleEndianArrayToString({low(), static_cast(high())}, &result); + } + return result; +} + +Result Decimal::FromString(std::string_view str, int32_t* precision, + int32_t* scale) { + if (str.empty()) { + return InvalidArgument("Decimal::FromString: empty string is not a valid Decimal"); + } + DecimalComponents dec; + if (!ParseDecimalComponents(str, &dec)) { + return InvalidArgument("Decimal::FromString: invalid decimal string '{}'", str); + } + + // Count number of significant digits (without leading zeros) + size_t first_non_zero = dec.while_digits.find_first_not_of('0'); + size_t significant_digits = dec.fractional_digits.size(); + if (first_non_zero != std::string_view::npos) { + significant_digits += dec.while_digits.size() - first_non_zero; + } + + auto parsed_precision = static_cast(significant_digits); + + int32_t parsed_scale = 0; + if (dec.has_exponent) { + auto adjusted_exponent = dec.exponent; + parsed_scale = static_cast(dec.fractional_digits.size()) - adjusted_exponent; + } else { + parsed_scale = static_cast(dec.fractional_digits.size()); + } + + // Index 1 is the high part, index 0 is the low part + std::array result_array = {0, 0}; + ShiftAndAdd(dec.while_digits, result_array); + ShiftAndAdd(dec.fractional_digits, result_array); + Decimal result(static_cast(result_array[1]), result_array[0]); + + if (dec.sign == '-') { + result.Negate(); + } + + std::cout << result_array[1] << " " << result_array[0] << std::endl; + + if (parsed_scale < 0) { + // For the scale to 0, to avoid negative scales (due to compatibility issues with + // external systems such as databases) + if (parsed_scale < -kMaxScale) { + return InvalidArgument( + "Decimal::FromString: scale must be in the range [-{}, {}], was {}", kMaxScale, + kMaxScale, parsed_scale); + } + + result *= kDecimal128PowersOfTen[-parsed_scale]; + parsed_precision -= parsed_scale; + parsed_scale = 0; + } + + if (precision != nullptr) { + *precision = parsed_precision; + } + if (scale != nullptr) { + *scale = parsed_scale; + } + + return result; +} + +template + requires std::is_floating_point_v +Result Decimal::FromReal(T value, int32_t precision, int32_t scale) { + if (precision < 1 || precision > kMaxPrecision) { + return InvalidArgument( + "Decimal::FromReal: precision must be in the range [1, {}], " + "was {}", + kMaxPrecision, precision); + } + if (scale > precision) { + return InvalidArgument( + "Decimal::FromReal: scale must be less than or equal to " + "precision, was scale = {}, precision = {}", + scale, precision); + } + + return NotImplemented("Decimal::FromReal is not implemented yet"); +} + +Result Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const { + if (orig_scale < 0 || orig_scale > kMaxScale) { + return InvalidArgument( + "Decimal::Rescale: original scale must be in the range [0, {}], " + "was {}", + kMaxScale, orig_scale); + } + if (new_scale < 0 || new_scale > kMaxScale) { + return InvalidArgument( + "Decimal::Rescale: new scale must be in the range [0, {}], " + "was {}", + kMaxScale, new_scale); + } + + if (orig_scale == new_scale) { + return *this; + } + + return NotImplemented("Decimal::Rescale is not implemented yet"); +} + +template + requires std::is_floating_point_v +T Decimal::ToReal(int32_t scale) const { + if (scale < -kMaxScale || scale > kMaxScale) { + throw InvalidArgument("Decimal::ToReal: scale must be in the range [-{}, {}], was {}", + kMaxScale, kMaxScale, scale); + } + + // Convert Decimal to floating-point value with the given scale. + // This is a placeholder implementation. + return T(0); // Replace with actual conversion logic. +} + +// Unary operators +ICEBERG_EXPORT Decimal operator-(const Decimal& operand) { + Decimal result(operand.high(), operand.low()); + return result.Negate(); +} + +ICEBERG_EXPORT Decimal operator~(const Decimal& operand) { + return {~operand.high(), ~operand.low()}; +} + +// Binary operators +ICEBERG_EXPORT Decimal operator+(const Decimal& lhs, const Decimal& rhs) { + Decimal result(lhs); + result += rhs; + return result; +} + +ICEBERG_EXPORT Decimal operator-(const Decimal& lhs, const Decimal& rhs) { + Decimal result(lhs); + result -= rhs; + return result; +} + +ICEBERG_EXPORT Decimal operator*(const Decimal& lhs, const Decimal& rhs) { + Decimal result(lhs); + result *= rhs; + return result; +} + +ICEBERG_EXPORT Decimal operator/(const Decimal& lhs, const Decimal& rhs) { + auto [result, _] = lhs.Divide(rhs).value(); + return result; +} + +ICEBERG_EXPORT Decimal operator%(const Decimal& lhs, const Decimal& rhs) { + auto [_, remainder] = lhs.Divide(rhs).value(); + return remainder; +} + +ICEBERG_EXPORT bool operator<(const Decimal& lhs, const Decimal& rhs) { + return (lhs.high() < rhs.high()) || (lhs.high() == rhs.high() && lhs.low() < rhs.low()); +} + +ICEBERG_EXPORT bool operator<=(const Decimal& lhs, const Decimal& rhs) { + return !operator>(lhs, rhs); +} + +ICEBERG_EXPORT bool operator>(const Decimal& lhs, const Decimal& rhs) { + return operator<(rhs, lhs); +} + +ICEBERG_EXPORT bool operator>=(const Decimal& lhs, const Decimal& rhs) { + return !operator<(lhs, rhs); +} + +} // namespace iceberg diff --git a/src/iceberg/expression/decimal.h b/src/iceberg/expression/decimal.h new file mode 100644 index 000000000..c4672bbde --- /dev/null +++ b/src/iceberg/expression/decimal.h @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/util/port.h" + +namespace iceberg { + +/// \brief Represents 128-bit fixed-point decimal numbers. +/// The max decimal precision that can be safely represented is +/// 38 significant digits. +class ICEBERG_EXPORT Decimal { + public: + static constexpr int kBitWidth = 128; + static constexpr int kByteWidth = kBitWidth / 8; + static constexpr int kMaxPrecision = 38; + static constexpr int kMaxScale = 38; + + /// \brief Default constructor initializes to zero. + constexpr Decimal() noexcept : data_{0, 0} {} + + /// \brief Construct a Decimal from an array of 2 64-bit integers. + explicit Decimal(std::array data) noexcept : data_(std::move(data)) {} + + /// \brief Create a Decimal from any integer not wider than 64 bits. + template + requires(std::is_integral_v && (sizeof(T) <= sizeof(uint64_t))) + constexpr Decimal(T value) noexcept // NOLINT + { + if (value < T{}) { + data_[kHighIndex] = ~static_cast(0); + } else { + data_[kHighIndex] = 0; + } + data_[kLowIndex] = static_cast(value); + } + + /// \brief Parse a Decimal from a string representation. + explicit Decimal(std::string_view str); + +/// \brief Create a Decimal from two 64-bit integers. +#if ICEBERG_LITTLE_ENDIAN + constexpr Decimal(int64_t high, uint64_t low) noexcept + : data_{low, static_cast(high)} {} +#else + constexpr Decimal(int64_t high, uint64_t low) noexcept + : data_{static_cast(high), low} {} +#endif + + /// \brief Negate the current Decimal value (in place) + Decimal& Negate(); + + /// \brief Absolute value of the current Decimal value (in place) + Decimal& Abs(); + + /// \brief Absolute value of the current Decimal value + static Decimal Abs(const Decimal& value); + + /// \brief Add a number to this one. The result is truncated to 128 bits. + Decimal& operator+=(const Decimal& other); + + /// \brief Subtract a number from this one. The result is truncated to 128 bits. + Decimal& operator-=(const Decimal& other); + + /// \brief Multiply this number by another. The result is truncated to 128 bits. + Decimal& operator*=(const Decimal& other); + + /// \brief Divide this number by another. + /// + /// The operation does not modify the current Decimal value. + /// The answer rounds towards zero. Signs work like: + /// 21 / 5 -> 4, 1 + /// -21 / 5 -> -4, -1 + /// 21 / -5 -> -4, 1 + /// -21 / -5 -> 4, -1 + /// \param[in] divisor the number to divide by + /// \return the pair of the quotient and the remainder + Result> Divide(const Decimal& divisor) const; + + /// \brief In place division. + Decimal& operator/=(const Decimal& other); + + /// \brief Bitwise OR operation. + Decimal& operator|=(const Decimal& other); + + /// \brief Bitwise AND operation. + Decimal& operator&=(const Decimal& other); + + /// \brief Shift left by the given number of bits (in place). + Decimal& operator<<=(uint32_t shift); + + /// \brief Shift left by the given number of bits. + Decimal operator<<(uint32_t shift) const { + Decimal result(*this); + result <<= shift; + return result; + } + + /// \brief Shift right by the given number of bits (in place). + Decimal& operator>>=(uint32_t shift); + + /// \brief Shift right by the given number of bits. + Decimal operator>>(uint32_t shift) const { + Decimal result(*this); + result >>= shift; + return result; + } + + /// \brief Get the high bits of the two's complement representation of the number. + constexpr int64_t high() const { return static_cast(data_[kHighIndex]); } + + /// \brief Get the low bits of the two's complement representation of the number. + constexpr uint64_t low() const { return data_[kLowIndex]; } + + /// \brief Convert the Decimal value to a base 10 decimal string with the given scale. + /// \param scale The scale to use for the string representation. + /// \return The string representation of the Decimal value. + Result ToString(int32_t scale = 0) const; + + /// \brief Convert the Decimal value to an integer string. + std::string ToIntegerString() const; + + /// \brief Convert the decimal string to a Decimal value, optionally including precision + /// and scale if they are provided not null. + static Result FromString(std::string_view str, int32_t* precision = nullptr, + int32_t* scale = nullptr); + + /// \brief Convert the floating-point value to a Decimal value with the given + /// precision and scale. + template + requires std::is_floating_point_v + static Result FromReal(T value, int32_t precision, int32_t scale); + + /// \brief Convert Decimal from one scale to another. + Result Rescale(int32_t orig_scale, int32_t new_scale) const; + + /// \brief Convert the Decimal value to a floating-point value with the given scale. + /// \param scale The scale to use for the conversion. + /// \return The floating-point value. + template + requires std::is_floating_point_v + T ToReal(int32_t scale = 0) const; + + /// \brief Returns 1 if positive or zero, -1 if strictly negative. + int64_t Sign() const { return 1 | (static_cast(data_[kHighIndex]) >> 63); } + + /// \brief Check if the Decimal value is negative. + bool IsNegative() const { return static_cast(data_[kHighIndex]) < 0; } + + friend bool operator==(const Decimal& lhs, const Decimal& rhs) { + return lhs.data_ == rhs.data_; + } + + friend bool operator!=(const Decimal& lhs, const Decimal& rhs) { + return lhs.data_ != rhs.data_; + } + + private: +#if ICEBERG_LITTLE_ENDIAN + static constexpr int kHighIndex = 1; + static constexpr int kLowIndex = 0; +#else + static constexpr int kHighIndex = 0; + static constexpr int kLowIndex = 1; +#endif + + std::array data_; +}; + +// Unary operators +ICEBERG_EXPORT Decimal operator-(const Decimal& operand); +ICEBERG_EXPORT Decimal operator~(const Decimal& operand); + +// Binary operators +ICEBERG_EXPORT bool operator<=(const Decimal& lhs, const Decimal& rhs); +ICEBERG_EXPORT bool operator<(const Decimal& lhs, const Decimal& rhs); +ICEBERG_EXPORT bool operator>=(const Decimal& lhs, const Decimal& rhs); +ICEBERG_EXPORT bool operator>(const Decimal& lhs, const Decimal& rhs); + +ICEBERG_EXPORT Decimal operator+(const Decimal& lhs, const Decimal& rhs); +ICEBERG_EXPORT Decimal operator-(const Decimal& lhs, const Decimal& rhs); +ICEBERG_EXPORT Decimal operator*(const Decimal& lhs, const Decimal& rhs); +ICEBERG_EXPORT Decimal operator/(const Decimal& lhs, const Decimal& rhs); +ICEBERG_EXPORT Decimal operator%(const Decimal& lhs, const Decimal& rhs); + +} // namespace iceberg diff --git a/src/iceberg/result.h b/src/iceberg/result.h index d1aa4cedd..a70ebac5a 100644 --- a/src/iceberg/result.h +++ b/src/iceberg/result.h @@ -32,6 +32,7 @@ enum class ErrorKind { kAlreadyExists, kCommitStateUnknown, kDecompressError, + kDivideByZero, kInvalidArgument, kInvalidArrowData, kInvalidExpression, @@ -46,6 +47,8 @@ enum class ErrorKind { kNotFound, kNotImplemented, kNotSupported, + kOverflow, + kRescaleDataLoss, kUnknownError, }; @@ -79,6 +82,7 @@ using Status = Result; DEFINE_ERROR_FUNCTION(AlreadyExists) DEFINE_ERROR_FUNCTION(CommitStateUnknown) DEFINE_ERROR_FUNCTION(DecompressError) +DEFINE_ERROR_FUNCTION(DivideByZero) DEFINE_ERROR_FUNCTION(InvalidArgument) DEFINE_ERROR_FUNCTION(InvalidArrowData) DEFINE_ERROR_FUNCTION(InvalidExpression) @@ -93,6 +97,8 @@ DEFINE_ERROR_FUNCTION(NotAllowed) DEFINE_ERROR_FUNCTION(NotFound) DEFINE_ERROR_FUNCTION(NotImplemented) DEFINE_ERROR_FUNCTION(NotSupported) +DEFINE_ERROR_FUNCTION(Overflow) +DEFINE_ERROR_FUNCTION(RescaleDataLoss) DEFINE_ERROR_FUNCTION(UnknownError) #undef DEFINE_ERROR_FUNCTION diff --git a/src/iceberg/util/port.h b/src/iceberg/util/port.h new file mode 100644 index 000000000..bdd65520b --- /dev/null +++ b/src/iceberg/util/port.h @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/// \file iceberg/util/port.h +/// \brief Portability macros and definitions for Iceberg C++ library + +#pragma once + +#if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define ICEBERG_LITTLE_ENDIAN 1 +#elif defined(__BIG_ENDIAN__) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define ICEBERG_LITTLE_ENDIAN 0 +#else +# error "Unsupported or unknown endianness" +#endif + +#if defined(_MSC_VER) +# include <__msvc_int128.hpp> +using int128_t = std::_Signed128; +using uint128_t = std::_Unsigned128; +#elif defined(__GNUC__) || defined(__clang__) +using int128_t = __int128; +using uint128_t = unsigned __int128; +#else +# error "128-bit integer type is not supported on this platform" +#endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 42ad13209..e0583468b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -75,7 +75,11 @@ add_iceberg_test(table_test table_test.cc schema_json_test.cc) -add_iceberg_test(expression_test SOURCES expression_test.cc literal_test.cc) +add_iceberg_test(expression_test + SOURCES + decimal_test.cc + expression_test.cc + literal_test.cc) add_iceberg_test(json_serde_test SOURCES diff --git a/test/decimal_test.cc b/test/decimal_test.cc new file mode 100644 index 000000000..41c78896c --- /dev/null +++ b/test/decimal_test.cc @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "iceberg/expression/decimal.h" + +#include + +#include "gmock/gmock.h" +#include "matchers.h" + +namespace iceberg { + +namespace { + +void AssertDecimalFromString(const std::string& s, const Decimal& expected, + int32_t expected_precision, int32_t expected_scale) { + int32_t precision = 0; + int32_t scale = 0; + auto result = Decimal::FromString(s, &precision, &scale); + EXPECT_THAT(result, IsOk()); + const Decimal& actual = result.value(); + EXPECT_EQ(expected, actual); + EXPECT_EQ(expected_precision, precision); + EXPECT_EQ(expected_scale, scale); +} + +} // namespace + +TEST(DecimalTest, Basics) { + AssertDecimalFromString("234.23445", Decimal(23423445), 8, 5); + + std::string string_value("-23049223942343532412"); + Decimal result(string_value); + Decimal expected(static_cast(-230492239423435324)); + ASSERT_EQ(result, expected * 100 - 12); + ASSERT_NE(result.high(), 0); + + result = Decimal("-23049223942343.532412"); + ASSERT_EQ(result, expected * 100 - 12); + ASSERT_NE(result.high(), 0); +} + +TEST(DecimalTest, StringStartingWithSign) { + AssertDecimalFromString("+234.567", Decimal(234567), 6, 3); + AssertDecimalFromString("+2342394230592.232349023094", + Decimal("2342394230592232349023094"), 25, 12); + AssertDecimalFromString("-234.567", Decimal("-234567"), 6, 3); + AssertDecimalFromString("-2342394230592.232349023094", + Decimal("-2342394230592232349023094"), 25, 12); +} + +TEST(DecimalTest, StringWithLeadingZeros) { + AssertDecimalFromString("0000000000000000000000000000000.234", Decimal(234), 3, 3); + AssertDecimalFromString("0000000000000000000000000000000.23400", Decimal(23400), 5, 5); + AssertDecimalFromString("234.00", Decimal(23400), 5, 2); + AssertDecimalFromString("234.0", Decimal(2340), 4, 1); + AssertDecimalFromString("0000000", Decimal(0), 0, 0); + AssertDecimalFromString("000.0000", Decimal(0), 4, 4); + AssertDecimalFromString(".00000", Decimal(0), 5, 5); +} + +TEST(DecimalTest, DecimalWithExponent) { + AssertDecimalFromString("1E1", Decimal(10), 2, 0); + AssertDecimalFromString("234.23445e2", Decimal(23423445), 8, 3); + AssertDecimalFromString("234.23445e-2", Decimal(23423445), 8, 7); + AssertDecimalFromString("234.23445E2", Decimal(23423445), 8, 3); + AssertDecimalFromString("234.23445E-2", Decimal(23423445), 8, 7); + AssertDecimalFromString("1.23E-8", Decimal(123), 3, 10); +} + +TEST(DecimalTest, SmallValues) { + struct TestValue { + std::string s; + int64_t expected; + int32_t expected_precision; + int32_t expected_scale; + }; + + for (const auto& tv : std::vector{ + {.s = "12.3", .expected = 123LL, .expected_precision = 3, .expected_scale = 1}, + {.s = "0.00123", + .expected = 123LL, + .expected_precision = 5, + .expected_scale = 5}, + {.s = "1.23E-8", + .expected = 123LL, + .expected_precision = 3, + .expected_scale = 10}, + {.s = "-1.23E-8", + .expected = -123LL, + .expected_precision = 3, + .expected_scale = 10}, + {.s = "1.23E+3", + .expected = 1230LL, + .expected_precision = 4, + .expected_scale = 0}, + {.s = "-1.23E+3", + .expected = -1230LL, + .expected_precision = 4, + .expected_scale = 0}, + {.s = "1.23E+5", + .expected = 123000LL, + .expected_precision = 6, + .expected_scale = 0}, + {.s = "1.2345E+7", + .expected = 12345000LL, + .expected_precision = 8, + .expected_scale = 0}, + {.s = "1.23e-8", + .expected = 123LL, + .expected_precision = 3, + .expected_scale = 10}, + {.s = "-1.23e-8", + .expected = -123LL, + .expected_precision = 3, + .expected_scale = 10}, + {.s = "1.23e+3", + .expected = 1230LL, + .expected_precision = 4, + .expected_scale = 0}, + {.s = "-1.23e+3", + .expected = -1230LL, + .expected_precision = 4, + .expected_scale = 0}, + {.s = "1.23e+5", + .expected = 123000LL, + .expected_precision = 6, + .expected_scale = 0}, + {.s = "1.2345e+7", + .expected = 12345000LL, + .expected_precision = 8, + .expected_scale = 0}}) { + AssertDecimalFromString(tv.s, Decimal(tv.expected), tv.expected_precision, + tv.expected_scale); + } +} + +} // namespace iceberg From f43023d1d340442fc993a4005be7b96a81b429c9 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Mon, 18 Aug 2025 23:44:25 +0800 Subject: [PATCH 02/20] feat: add ToIntegerString --- src/iceberg/expression/decimal.cc | 73 ++++++++++++++++++++++++++++--- test/decimal_test.cc | 14 ++++++ 2 files changed, 80 insertions(+), 7 deletions(-) diff --git a/src/iceberg/expression/decimal.cc b/src/iceberg/expression/decimal.cc index e5ac492ef..a79a957e5 100644 --- a/src/iceberg/expression/decimal.cc +++ b/src/iceberg/expression/decimal.cc @@ -19,6 +19,7 @@ #include "iceberg/expression/decimal.h" +#include #include #include #include @@ -26,8 +27,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -622,9 +625,69 @@ static inline void ShiftAndAdd(std::string_view input, std::array& } } +// Returns a mask for the bit_index lower order bits. +// Only valid for bit_index in the range [0, 64). +constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) { + return (static_cast(1) << bit_index) - 1; +} + static void AppendLittleEndianArrayToString(const std::array& array, std::string* out) { - // TODO(zhjwpku): Implementation this. + const auto most_significant_non_zero = std::ranges::find_if( + array.rbegin(), array.rend(), [](uint64_t v) { return v != 0; }); + if (most_significant_non_zero == array.rend()) { + out->push_back('0'); + return; + } + + size_t most_significant_elem_idx = &*most_significant_non_zero - array.data(); + std::array copy = array; + constexpr uint32_t k1e9 = 1000000000U; + constexpr size_t kNumBits = 128; + // Segments will contain the array split into groups that map to decimal digits, in + // little endian order. Each segment will hold at most 9 decimal digits. For example, if + // the input represents 9876543210123456789, then segments will be [123456789, + // 876543210, 9]. + // The max number of segments needed = ceil(kNumBits * log(2) / log(1e9)) + // = ceil(kNumBits / 29.897352854) <= ceil(kNumBits / 29). + std::array segments; + size_t num_segments = 0; + uint64_t* most_significant_elem = ©[most_significant_elem_idx]; + + std::cout << copy[1] << " " << copy[0] << std::endl; + do { + // Compute remainder = copy % 1e9 and copy = copy / 1e9. + uint32_t remainder = 0; + uint64_t* elem = most_significant_elem; + do { + // Compute dividend = (remainder << 32) | *elem (a virtual 96-bit integer); + // *elem = dividend / 1e9; + // remainder = dividend % 1e9. + auto hi = static_cast(*elem >> 32); + auto lo = static_cast(*elem & LeastSignificantBitMask(32)); + uint64_t dividend_hi = (static_cast(remainder) << 32) | hi; + uint64_t quotient_hi = dividend_hi / k1e9; + remainder = static_cast(dividend_hi % k1e9); + uint64_t dividend_lo = (static_cast(remainder) << 32) | lo; + uint64_t quotient_lo = dividend_lo / k1e9; + remainder = static_cast(dividend_lo % k1e9); + *elem = (quotient_hi << 32) | quotient_lo; + } while (elem-- != copy.data()); + + segments[num_segments++] = remainder; + } while (*most_significant_elem != 0 || most_significant_elem-- != copy.data()); + + const uint32_t* segment = &segments[num_segments - 1]; + std::stringstream oss; + // First segment is formatted as-is. + oss << *segment; + // Remaining segments are formatted with leading zeros to fill 9 digits. e.g. 123 is + // formatted as "000000123" + while (segment != segments.data()) { + --segment; + oss << std::setfill('0') << std::setw(9) << *segment; + } + out->append(oss.str()); } } // namespace @@ -689,8 +752,6 @@ Result Decimal::FromString(std::string_view str, int32_t* precision, result.Negate(); } - std::cout << result_array[1] << " " << result_array[0] << std::endl; - if (parsed_scale < 0) { // For the scale to 0, to avoid negative scales (due to compatibility issues with // external systems such as databases) @@ -798,13 +859,11 @@ ICEBERG_EXPORT Decimal operator*(const Decimal& lhs, const Decimal& rhs) { } ICEBERG_EXPORT Decimal operator/(const Decimal& lhs, const Decimal& rhs) { - auto [result, _] = lhs.Divide(rhs).value(); - return result; + return lhs.Divide(rhs).value().first; } ICEBERG_EXPORT Decimal operator%(const Decimal& lhs, const Decimal& rhs) { - auto [_, remainder] = lhs.Divide(rhs).value(); - return remainder; + return lhs.Divide(rhs).value().second; } ICEBERG_EXPORT bool operator<(const Decimal& lhs, const Decimal& rhs) { diff --git a/test/decimal_test.cc b/test/decimal_test.cc index 41c78896c..5b140c4c5 100644 --- a/test/decimal_test.cc +++ b/test/decimal_test.cc @@ -150,4 +150,18 @@ TEST(DecimalTest, SmallValues) { } } +TEST(DecimalTest, LargeValues) { + const std::array string_values = { + "99999999999999999999999999999999999999", "-99999999999999999999999999999999999999", + "170141183460469231731687303715884105727", // maximum positive value + "-170141183460469231731687303715884105728" // minimum negative value + }; + + for (const auto& s : string_values) { + const Decimal value(s); + const std::string printed_value = value.ToIntegerString(); + EXPECT_EQ(printed_value, s) << "Expected: " << s << ", but got: " << printed_value; + } +} + } // namespace iceberg From 4ebb2812f9bc08f695c73f2b4e24ecac1a617038 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 20 Aug 2025 00:36:44 +0800 Subject: [PATCH 03/20] feat: add ToString --- src/iceberg/expression/decimal.cc | 77 ++++++++- test/decimal_test.cc | 251 ++++++++++++++++++++++++++++++ 2 files changed, 323 insertions(+), 5 deletions(-) diff --git a/src/iceberg/expression/decimal.cc b/src/iceberg/expression/decimal.cc index a79a957e5..eadab243a 100644 --- a/src/iceberg/expression/decimal.cc +++ b/src/iceberg/expression/decimal.cc @@ -188,9 +188,10 @@ static Status BuildFromArray(Decimal* result, const uint32_t* array, int64_t len int64_t next_index = length - 1; for (size_t i = 0; i < 2 && next_index >= 0; i++) { uint64_t lower_bits = array[next_index--]; - result_array[i] = (next_index < 0) - ? lower_bits - : (static_cast(lower_bits) << 32) | lower_bits; + result_array[i] = + (next_index < 0) + ? lower_bits + : (static_cast(array[next_index--]) << 32) | lower_bits; } *result = Decimal(result_array[1], result_array[0]); @@ -654,7 +655,6 @@ static void AppendLittleEndianArrayToString(const std::array& array size_t num_segments = 0; uint64_t* most_significant_elem = ©[most_significant_elem_idx]; - std::cout << copy[1] << " " << copy[0] << std::endl; do { // Compute remainder = copy % 1e9 and copy = copy / 1e9. uint32_t remainder = 0; @@ -690,6 +690,71 @@ static void AppendLittleEndianArrayToString(const std::array& array out->append(oss.str()); } +static void AdjustIntegerStringWithScale(std::string* str, int32_t scale) { + if (scale == 0) { + return; + } + assert(str != nullptr); + assert(!str->empty()); + const bool is_negative = str->front() == '-'; + const auto is_negative_offset = static_cast(is_negative); + const auto len = static_cast(str->size()); + const int32_t num_digits = len - is_negative_offset; + const int32_t adjusted_exponent = num_digits - 1 - scale; + + // Note that the -6 is taken from the Java BigDecimal documentation. + if (scale < 0 || adjusted_exponent < -6) { + // Example 1: + // Precondition: *str = "123", is_negative_offset = 0, num_digits = 3, scale = -2, + // adjusted_exponent = 4 + // After inserting decimal point: *str = "1.23" + // After appending exponent: *str = "1.23E+4" + // Example 2: + // Precondition: *str = "-123", is_negative_offset = 1, num_digits = 3, scale = 9, + // adjusted_exponent = -7 + // After inserting decimal point: *str = "-1.23" + // After appending exponent: *str = "-1.23E-7" + // Example 3: + // Precondition: *str = "0", is_negative_offset = 0, num_digits = 1, scale = -1, + // adjusted_exponent = 1 + // After inserting decimal point: *str = "0" // Not inserted + // After appending exponent: *str = "0E+1" + if (num_digits > 1) { + str->insert(str->begin() + 1 + is_negative_offset, '.'); + } + str->push_back('E'); + if (adjusted_exponent >= 0) { + str->push_back('+'); + } + // Append the adjusted exponent as a string. + str->append(std::to_string(adjusted_exponent)); + return; + } + + if (num_digits > scale) { + const auto n = static_cast(len - scale); + // Example 1: + // Precondition: *str = "123", len = num_digits = 3, scale = 1, n = 2 + // After inserting decimal point: *str = "12.3" + // Example 2: + // Precondition: *str = "-123", len = 4, num_digits = 3, scale = 1, n = 3 + // After inserting decimal point: *str = "-12.3" + str->insert(str->begin() + n, '.'); + return; + } + + // Example 1: + // Precondition: *str = "123", is_negative_offset = 0, num_digits = 3, scale = 4 + // After insert: *str = "000123" + // After setting decimal point: *str = "0.0123" + // Example 2: + // Precondition: *str = "-123", is_negative_offset = 1, num_digits = 3, scale = 4 + // After insert: *str = "-000123" + // After setting decimal point: *str = "-0.0123" + str->insert(is_negative_offset, scale - num_digits + 2, '0'); + str->at(is_negative_offset + 1) = '.'; +} + } // namespace Result Decimal::ToString(int32_t scale) const { @@ -698,7 +763,9 @@ Result Decimal::ToString(int32_t scale) const { "Decimal::ToString: scale must be in the range [-{}, {}], was {}", kMaxScale, kMaxScale, scale); } - return NotImplemented("Decimal::ToString is not implemented yet"); + std::string str(ToIntegerString()); + AdjustIntegerStringWithScale(&str, scale); + return str; } std::string Decimal::ToIntegerString() const { diff --git a/test/decimal_test.cc b/test/decimal_test.cc index 5b140c4c5..dd28a0c21 100644 --- a/test/decimal_test.cc +++ b/test/decimal_test.cc @@ -18,7 +18,11 @@ */ #include "iceberg/expression/decimal.h" +#include +#include + #include +#include #include "gmock/gmock.h" #include "matchers.h" @@ -164,4 +168,251 @@ TEST(DecimalTest, LargeValues) { } } +TEST(DecimalTest, TestStringRoundTrip) { + static constexpr std::array kTestBits = { + 0, + 1, + 999, + 1000, + std::numeric_limits::max(), + (1ull << 31), + std::numeric_limits::max(), + (1ull << 32), + std::numeric_limits::max(), + (1ull << 63), + std::numeric_limits::max(), + }; + static constexpr std::array kScales = {0, 1, 10}; + for (uint64_t high : kTestBits) { + for (uint64_t low : kTestBits) { + Decimal value(high, low); + for (int32_t scale : kScales) { + auto result = value.ToString(scale); + + ASSERT_THAT(result, IsOk()) + << "Failed to convert Decimal to string: " << value.ToIntegerString() + << ", scale: " << scale; + + auto round_trip = Decimal::FromString(result.value()); + ASSERT_THAT(round_trip, IsOk()) + << "Failed to convert string back to Decimal: " << result.value(); + + EXPECT_EQ(value, round_trip.value()) + << "Round trip failed for value: " << value.ToIntegerString() + << ", scale: " << scale; + } + } + } +} + +TEST(DecimalTest, FromStringLimits) { + AssertDecimalFromString("1e37", Decimal(542101086242752217ULL, 68739955140067328ULL), + 38, 0); + + AssertDecimalFromString( + "-1e37", Decimal(17904642987466799398ULL, 18378004118569484288ULL), 38, 0); + AssertDecimalFromString( + "9.87e37", Decimal(5350537721215964381ULL, 15251391175463010304ULL), 38, 0); + AssertDecimalFromString( + "-9.87e37", Decimal(13096206352493587234ULL, 3195352898246541312ULL), 38, 0); + AssertDecimalFromString("12345678901234567890123456789012345678", + Decimal(669260594276348691ULL, 14143994781733811022ULL), 38, 0); + AssertDecimalFromString("-12345678901234567890123456789012345678", + Decimal(17777483479433202924ULL, 4302749291975740594ULL), 38, + 0); + + // "9..9" (38 times) + const auto dec38times9pos = Decimal(5421010862427522170ULL, 687399551400673279ULL); + // "-9..9" (38 times) + const auto dec38times9neg = Decimal(13025733211282029445ULL, 17759344522308878337ULL); + + AssertDecimalFromString("99999999999999999999999999999999999999", dec38times9pos, 38, + 0); + AssertDecimalFromString("-99999999999999999999999999999999999999", dec38times9neg, 38, + 0); + AssertDecimalFromString("9.9999999999999999999999999999999999999e37", dec38times9pos, + 38, 0); + AssertDecimalFromString("-9.9999999999999999999999999999999999999e37", dec38times9neg, + 38, 0); + + // No exponent, many fractional digits + AssertDecimalFromString("9.9999999999999999999999999999999999999", dec38times9pos, 38, + 37); + AssertDecimalFromString("-9.9999999999999999999999999999999999999", dec38times9neg, 38, + 37); + AssertDecimalFromString("0.99999999999999999999999999999999999999", dec38times9pos, 38, + 38); + AssertDecimalFromString("-0.99999999999999999999999999999999999999", dec38times9neg, 38, + 38); + + // Negative exponent + AssertDecimalFromString("1e-38", Decimal(0, 1), 1, 38); + AssertDecimalFromString( + "-1e-38", Decimal(18446744073709551615ULL, 18446744073709551615ULL), 1, 38); + AssertDecimalFromString("9.99e-36", Decimal(0, 999), 3, 38); + AssertDecimalFromString( + "-9.99e-36", Decimal(18446744073709551615ULL, 18446744073709550617ULL), 3, 38); + AssertDecimalFromString("987e-38", Decimal(0, 987), 3, 38); + AssertDecimalFromString( + "-987e-38", Decimal(18446744073709551615ULL, 18446744073709550629ULL), 3, 38); + AssertDecimalFromString("99999999999999999999999999999999999999e-37", dec38times9pos, + 38, 37); + AssertDecimalFromString("-99999999999999999999999999999999999999e-37", dec38times9neg, + 38, 37); + AssertDecimalFromString("99999999999999999999999999999999999999e-38", dec38times9pos, + 38, 38); + AssertDecimalFromString("-99999999999999999999999999999999999999e-38", dec38times9neg, + 38, 38); +} + +TEST(DecimalTest, FromStringInvalid) { + // Empty string + auto result = Decimal::FromString(""); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + ASSERT_THAT(result, HasErrorMessage( + "Decimal::FromString: empty string is not a valid Decimal")); + for (const auto& invalid_string : + std::vector{"-", "0.0.0", "0-13-32", "a", "-23092.235-", + "-+23092.235", "+-23092.235", "00a", "1e1a", "0.00123D/3", + "1.23eA8", "1.23E+3A", "-1.23E--5", "1.2345E+++07"}) { + auto result = Decimal::FromString(invalid_string); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + ASSERT_THAT(result, HasErrorMessage("Decimal::FromString: invalid decimal string")); + } + + for (const auto& invalid_string : + std::vector{"1e39", "-1e39", "9e39", "-9e39", "9.9e40", "-9.9e40"}) { + auto result = Decimal::FromString(invalid_string); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + ASSERT_THAT(result, + HasErrorMessage("Decimal::FromString: scale must be in the range")); + } +} + +TEST(DecimalTest, Division) { + const std::string expected_string_value("-23923094039234029"); + const Decimal value(expected_string_value); + const Decimal result(value / 3); + const Decimal expected_value("-7974364679744676"); + ASSERT_EQ(expected_value, result); +} + +TEST(DecimalTest, ToString) { + struct ToStringCase { + int64_t test_value; + int32_t scale; + const char* expected_string; + }; + + for (const auto& t : std::vector{ + {.test_value = 0, .scale = -1, .expected_string = "0E+1"}, + {.test_value = 0, .scale = 0, .expected_string = "0"}, + {.test_value = 0, .scale = 1, .expected_string = "0.0"}, + {.test_value = 0, .scale = 6, .expected_string = "0.000000"}, + {.test_value = 2, .scale = 7, .expected_string = "2E-7"}, + {.test_value = 2, .scale = -1, .expected_string = "2E+1"}, + {.test_value = 2, .scale = 0, .expected_string = "2"}, + {.test_value = 2, .scale = 1, .expected_string = "0.2"}, + {.test_value = 2, .scale = 6, .expected_string = "0.000002"}, + {.test_value = -2, .scale = 7, .expected_string = "-2E-7"}, + {.test_value = -2, .scale = 7, .expected_string = "-2E-7"}, + {.test_value = -2, .scale = -1, .expected_string = "-2E+1"}, + {.test_value = -2, .scale = 0, .expected_string = "-2"}, + {.test_value = -2, .scale = 1, .expected_string = "-0.2"}, + {.test_value = -2, .scale = 6, .expected_string = "-0.000002"}, + {.test_value = -2, .scale = 7, .expected_string = "-2E-7"}, + {.test_value = 123, .scale = -3, .expected_string = "1.23E+5"}, + {.test_value = 123, .scale = -1, .expected_string = "1.23E+3"}, + {.test_value = 123, .scale = 1, .expected_string = "12.3"}, + {.test_value = 123, .scale = 0, .expected_string = "123"}, + {.test_value = 123, .scale = 5, .expected_string = "0.00123"}, + {.test_value = 123, .scale = 8, .expected_string = "0.00000123"}, + {.test_value = 123, .scale = 9, .expected_string = "1.23E-7"}, + {.test_value = 123, .scale = 10, .expected_string = "1.23E-8"}, + {.test_value = -123, .scale = -3, .expected_string = "-1.23E+5"}, + {.test_value = -123, .scale = -1, .expected_string = "-1.23E+3"}, + {.test_value = -123, .scale = 1, .expected_string = "-12.3"}, + {.test_value = -123, .scale = 0, .expected_string = "-123"}, + {.test_value = -123, .scale = 5, .expected_string = "-0.00123"}, + {.test_value = -123, .scale = 8, .expected_string = "-0.00000123"}, + {.test_value = -123, .scale = 9, .expected_string = "-1.23E-7"}, + {.test_value = -123, .scale = 10, .expected_string = "-1.23E-8"}, + {.test_value = 1000000000, .scale = -3, .expected_string = "1.000000000E+12"}, + {.test_value = 1000000000, .scale = -1, .expected_string = "1.000000000E+10"}, + {.test_value = 1000000000, .scale = 0, .expected_string = "1000000000"}, + {.test_value = 1000000000, .scale = 1, .expected_string = "100000000.0"}, + {.test_value = 1000000000, .scale = 5, .expected_string = "10000.00000"}, + {.test_value = 1000000000, + .scale = 15, + .expected_string = "0.000001000000000"}, + {.test_value = 1000000000, .scale = 16, .expected_string = "1.000000000E-7"}, + {.test_value = 1000000000, .scale = 17, .expected_string = "1.000000000E-8"}, + {.test_value = -1000000000, + .scale = -3, + .expected_string = "-1.000000000E+12"}, + {.test_value = -1000000000, + .scale = -1, + .expected_string = "-1.000000000E+10"}, + {.test_value = -1000000000, .scale = 0, .expected_string = "-1000000000"}, + {.test_value = -1000000000, .scale = 1, .expected_string = "-100000000.0"}, + {.test_value = -1000000000, .scale = 5, .expected_string = "-10000.00000"}, + {.test_value = -1000000000, + .scale = 15, + .expected_string = "-0.000001000000000"}, + {.test_value = -1000000000, .scale = 16, .expected_string = "-1.000000000E-7"}, + {.test_value = -1000000000, .scale = 17, .expected_string = "-1.000000000E-8"}, + {.test_value = 1234567890123456789LL, + .scale = -3, + .expected_string = "1.234567890123456789E+21"}, + {.test_value = 1234567890123456789LL, + .scale = -1, + .expected_string = "1.234567890123456789E+19"}, + {.test_value = 1234567890123456789LL, + .scale = 0, + .expected_string = "1234567890123456789"}, + {.test_value = 1234567890123456789LL, + .scale = 1, + .expected_string = "123456789012345678.9"}, + {.test_value = 1234567890123456789LL, + .scale = 5, + .expected_string = "12345678901234.56789"}, + {.test_value = 1234567890123456789LL, + .scale = 24, + .expected_string = "0.000001234567890123456789"}, + {.test_value = 1234567890123456789LL, + .scale = 25, + .expected_string = "1.234567890123456789E-7"}, + {.test_value = -1234567890123456789LL, + .scale = -3, + .expected_string = "-1.234567890123456789E+21"}, + {.test_value = -1234567890123456789LL, + .scale = -1, + .expected_string = "-1.234567890123456789E+19"}, + {.test_value = -1234567890123456789LL, + .scale = 0, + .expected_string = "-1234567890123456789"}, + {.test_value = -1234567890123456789LL, + .scale = 1, + .expected_string = "-123456789012345678.9"}, + {.test_value = -1234567890123456789LL, + .scale = 5, + .expected_string = "-12345678901234.56789"}, + {.test_value = -1234567890123456789LL, + .scale = 24, + .expected_string = "-0.000001234567890123456789"}, + {.test_value = -1234567890123456789LL, + .scale = 25, + .expected_string = "-1.234567890123456789E-7"}, + }) { + const Decimal value(t.test_value); + auto result = value.ToString(t.scale); + ASSERT_THAT(result, IsOk()) + << "Failed to convert Decimal to string: " << value.ToIntegerString() + << ", scale: " << t.scale; + + EXPECT_EQ(result.value(), t.expected_string) + << "Expected: " << t.expected_string << ", but got: " << result.value(); + } +} + } // namespace iceberg From d93b6503acc00fff4cae5ec409e32e7e5e1407f2 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Thu, 21 Aug 2025 00:54:15 +0800 Subject: [PATCH 04/20] feat: add FromReal and ToReal --- src/iceberg/expression/decimal.cc | 504 ++++++++++++++++++++++++++---- src/iceberg/expression/decimal.h | 44 ++- test/decimal_test.cc | 94 ++++++ 3 files changed, 563 insertions(+), 79 deletions(-) diff --git a/src/iceberg/expression/decimal.cc b/src/iceberg/expression/decimal.cc index eadab243a..cd7acaa25 100644 --- a/src/iceberg/expression/decimal.cc +++ b/src/iceberg/expression/decimal.cc @@ -146,7 +146,7 @@ static inline int64_t FillInArray(const Decimal& value, uint32_t* array) { /// \brief Shift the number in the array left by bits positions. static inline void ShiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) { if (length > 0 && bits > 0) { - for (int i = 0; i < length - 1; ++i) { + for (int32_t i = 0; i < length - 1; ++i) { array[i] = (array[i] << bits) | (array[i + 1] >> (32 - bits)); } array[length - 1] <<= bits; @@ -156,7 +156,7 @@ static inline void ShiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) /// \brief Shift the number in the array right by bits positions. static inline void ShiftArrayRight(uint32_t* array, int64_t length, int64_t bits) { if (length > 0 && bits > 0) { - for (int i = length - 1; i > 0; --i) { + for (int32_t i = length - 1; i > 0; --i) { array[i] = (array[i] >> bits) | (array[i - 1] << (32 - bits)); } array[0] >>= bits; @@ -333,47 +333,6 @@ static inline Status DecimalDivide(const Decimal& dividend, const Decimal& divis return {}; } -/// \brief Powers of ten for Decimal with scale from 0 to 38. -constexpr std::array kDecimal128PowersOfTen = { - Decimal(1LL), - Decimal(10LL), - Decimal(100LL), - Decimal(1000LL), - Decimal(10000LL), - Decimal(100000LL), - Decimal(1000000LL), - Decimal(10000000LL), - Decimal(100000000LL), - Decimal(1000000000LL), - Decimal(10000000000LL), - Decimal(100000000000LL), - Decimal(1000000000000LL), - Decimal(10000000000000LL), - Decimal(100000000000000LL), - Decimal(1000000000000000LL), - Decimal(10000000000000000LL), - Decimal(100000000000000000LL), - Decimal(1000000000000000000LL), - Decimal(0LL, 10000000000000000000ULL), - Decimal(5LL, 7766279631452241920ULL), - Decimal(54LL, 3875820019684212736ULL), - Decimal(542LL, 1864712049423024128ULL), - Decimal(5421LL, 200376420520689664ULL), - Decimal(54210LL, 2003764205206896640ULL), - Decimal(542101LL, 1590897978359414784ULL), - Decimal(5421010LL, 15908979783594147840ULL), - Decimal(54210108LL, 11515845246265065472ULL), - Decimal(542101086LL, 4477988020393345024ULL), - Decimal(5421010862LL, 7886392056514347008ULL), - Decimal(54210108624LL, 5076944270305263616ULL), - Decimal(542101086242LL, 13875954555633532928ULL), - Decimal(5421010862427LL, 9632337040368467968ULL), - Decimal(54210108624275LL, 4089650035136921600ULL), - Decimal(542101086242752LL, 4003012203950112768ULL), - Decimal(5421010862427522LL, 3136633892082024448ULL), - Decimal(54210108624275221LL, 12919594847110692864ULL), - Decimal(542101086242752217LL, 68739955140067328ULL), - Decimal(5421010862427522170LL, 687399551400673280ULL)}; } // namespace Decimal::Decimal(std::string_view str) { @@ -607,6 +566,48 @@ constexpr std::array kUInt64PowersOfTen = { // clang-format on }; +/// \brief Powers of ten for Decimal with scale from 0 to 38. +constexpr std::array kDecimal128PowersOfTen = { + Decimal(1LL), + Decimal(10LL), + Decimal(100LL), + Decimal(1000LL), + Decimal(10000LL), + Decimal(100000LL), + Decimal(1000000LL), + Decimal(10000000LL), + Decimal(100000000LL), + Decimal(1000000000LL), + Decimal(10000000000LL), + Decimal(100000000000LL), + Decimal(1000000000000LL), + Decimal(10000000000000LL), + Decimal(100000000000000LL), + Decimal(1000000000000000LL), + Decimal(10000000000000000LL), + Decimal(100000000000000000LL), + Decimal(1000000000000000000LL), + Decimal(0LL, 10000000000000000000ULL), + Decimal(5LL, 7766279631452241920ULL), + Decimal(54LL, 3875820019684212736ULL), + Decimal(542LL, 1864712049423024128ULL), + Decimal(5421LL, 200376420520689664ULL), + Decimal(54210LL, 2003764205206896640ULL), + Decimal(542101LL, 1590897978359414784ULL), + Decimal(5421010LL, 15908979783594147840ULL), + Decimal(54210108LL, 11515845246265065472ULL), + Decimal(542101086LL, 4477988020393345024ULL), + Decimal(5421010862LL, 7886392056514347008ULL), + Decimal(54210108624LL, 5076944270305263616ULL), + Decimal(542101086242LL, 13875954555633532928ULL), + Decimal(5421010862427LL, 9632337040368467968ULL), + Decimal(54210108624275LL, 4089650035136921600ULL), + Decimal(542101086242752LL, 4003012203950112768ULL), + Decimal(5421010862427522LL, 3136633892082024448ULL), + Decimal(54210108624275221LL, 12919594847110692864ULL), + Decimal(542101086242752217LL, 68739955140067328ULL), + Decimal(5421010862427522170LL, 687399551400673280ULL)}; + static inline void ShiftAndAdd(std::string_view input, std::array& out) { for (size_t pos = 0; pos < input.size();) { const size_t group_size = std::min(kInt64DecimalDigits, input.size() - pos); @@ -843,23 +844,392 @@ Result Decimal::FromString(std::string_view str, int32_t* precision, return result; } -template - requires std::is_floating_point_v -Result Decimal::FromReal(T value, int32_t precision, int32_t scale) { - if (precision < 1 || precision > kMaxPrecision) { - return InvalidArgument( - "Decimal::FromReal: precision must be in the range [1, {}], " - "was {}", - kMaxPrecision, precision); +namespace { + +constexpr float kFloatInf = std::numeric_limits::infinity(); + +// Attention: these pre-computed constants might not exactly represent their +// decimal counterparts: +// >>> int32_t(1e38) +// 99999999999999997748809823456034029568 + +constexpr int32_t kPrecomputedPowersOfTen = 76; + +constexpr std::array kFloatPowersOfTen = { + 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1e-45f, 1e-44f, 1e-43f, 1e-42f, + 1e-41f, 1e-40f, 1e-39f, 1e-38f, 1e-37f, 1e-36f, 1e-35f, + 1e-34f, 1e-33f, 1e-32f, 1e-31f, 1e-30f, 1e-29f, 1e-28f, + 1e-27f, 1e-26f, 1e-25f, 1e-24f, 1e-23f, 1e-22f, 1e-21f, + 1e-20f, 1e-19f, 1e-18f, 1e-17f, 1e-16f, 1e-15f, 1e-14f, + 1e-13f, 1e-12f, 1e-11f, 1e-10f, 1e-9f, 1e-8f, 1e-7f, + 1e-6f, 1e-5f, 1e-4f, 1e-3f, 1e-2f, 1e-1f, 1e0f, + 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f, + 1e8f, 1e9f, 1e10f, 1e11f, 1e12f, 1e13f, 1e14f, + 1e15f, 1e16f, 1e17f, 1e18f, 1e19f, 1e20f, 1e21f, + 1e22f, 1e23f, 1e24f, 1e25f, 1e26f, 1e27f, 1e28f, + 1e29f, 1e30f, 1e31f, 1e32f, 1e33f, 1e34f, 1e35f, + 1e36f, 1e37f, 1e38f, kFloatInf, kFloatInf, kFloatInf, kFloatInf, + kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, + kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, + kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, + kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, + kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf}; + +constexpr std::array kDoublePowersOfTen = { + 1e-76, 1e-75, 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66, 1e-65, + 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57, 1e-56, 1e-55, 1e-54, 1e-53, + 1e-52, 1e-51, 1e-50, 1e-49, 1e-48, 1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41, + 1e-40, 1e-39, 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29, + 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17, + 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, + 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, + 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, + 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31, + 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42, 1e43, + 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, + 1e56, 1e57, 1e58, 1e59, 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, + 1e68, 1e69, 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76}; + +// ceil(log2(10 ^ k)) for k in [0...76] +constexpr std::array kCeilLog2PowersOfTen = { + 0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40, 44, 47, 50, + 54, 57, 60, 64, 67, 70, 74, 77, 80, 84, 87, 90, 94, 97, 100, 103, + 107, 110, 113, 117, 120, 123, 127, 130, 133, 137, 140, 143, 147, 150, 153, 157, + 160, 163, 167, 170, 173, 177, 180, 183, 187, 190, 193, 196, 200, 203, 206, 210, + 213, 216, 220, 223, 226, 230, 233, 236, 240, 243, 246, 250, 253}; + +template +struct RealTraits {}; + +template <> +struct RealTraits { + static constexpr const float* powers_of_ten() { return kFloatPowersOfTen.data(); } + + static constexpr float two_to_64(float x) { return x * 1.8446744e+19f; } + static constexpr float two_to_128(float x) { return x == 0 ? 0 : kFloatInf; } + static constexpr float two_to_192(float x) { return x == 0 ? 0 : kFloatInf; } + + static constexpr int32_t kMantissaBits = 24; + // ceil(log10(2 ^ kMantissaBits)) + static constexpr int32_t kMantissaDigits = 8; + // Integers between zero and kMaxPreciseInteger can be precisely represented + static constexpr uint64_t kMaxPreciseInteger = (1ULL << kMantissaBits) - 1; +}; + +template <> +struct RealTraits { + static constexpr const double* powers_of_ten() { return kDoublePowersOfTen.data(); } + + static constexpr double two_to_64(double x) { return x * 1.8446744073709552e+19; } + static constexpr double two_to_128(double x) { return x * 3.402823669209385e+38; } + static constexpr double two_to_192(double x) { return x * 6.277101735386681e+57; } + + static constexpr int32_t kMantissaBits = 53; + // ceil(log10(2 ^ kMantissaBits)) + static constexpr int32_t kMantissaDigits = 16; + // Integers between zero and kMaxPreciseInteger can be precisely represented + static constexpr uint64_t kMaxPreciseInteger = (1ULL << kMantissaBits) - 1; +}; + +struct DecimalRealConversion { + // Return 10**exp, with a fast lookup, assuming `exp` is within bounds + template + static Real PowerOfTen(int32_t exp) { + constexpr int32_t N = kPrecomputedPowersOfTen; + assert(exp >= -N && exp <= N); + return RealTraits::powers_of_ten()[N + exp]; } - if (scale > precision) { - return InvalidArgument( - "Decimal::FromReal: scale must be less than or equal to " - "precision, was scale = {}, precision = {}", - scale, precision); + + // Return 10**exp, with a fast lookup if possible + template + static Real LargePowerOfTen(int32_t exp) { + constexpr int32_t N = kPrecomputedPowersOfTen; + if (exp >= -N && exp <= N) { + return RealTraits::powers_of_ten()[N + exp]; + } else { + return std::pow(static_cast(10.0), static_cast(exp)); + } + } + + static constexpr int32_t kMaxPrecision = Decimal::kMaxPrecision; + static constexpr int32_t kMaxScale = Decimal::kMaxScale; + + static constexpr auto& DecimalPowerOfTen(int32_t exp) { + assert(exp >= 0 && exp <= kMaxPrecision); + return kDecimal128PowersOfTen[exp]; + } + + // Right shift positive `x` by positive `bits`, rounded half to even + static Decimal RoundedRightShift(const Decimal& x, int32_t bits) { + if (bits == 0) { + return x; + } + int64_t result_hi = x.high(); + uint64_t result_lo = x.low(); + uint64_t shifted = 0; + while (bits >= 64) { + // Retain the information that set bits were shifted right. + // This is important to detect an exact half. + shifted = result_lo | (shifted > 0); + result_lo = result_hi; + result_hi >>= 63; // for sign + bits -= 64; + } + if (bits > 0) { + shifted = (result_lo << (64 - bits)) | (shifted > 0); + result_lo >>= bits; + result_lo |= static_cast(result_hi) << (64 - bits); + result_hi >>= bits; + } + // We almost have our result, but now do the rounding. + constexpr uint64_t kHalf = 0x8000000000000000ULL; + if (shifted > kHalf) { + // Strictly more than half => round up + result_lo += 1; + result_hi += (result_lo == 0); + } else if (shifted == kHalf) { + // Exactly half => round to even + if ((result_lo & 1) != 0) { + result_lo += 1; + result_hi += (result_lo == 0); + } + } else { + // Strictly less than half => round down + } + return Decimal{result_hi, result_lo}; + } + + template + static Result FromPositiveApprox(Real real, int32_t precision, int32_t scale) { + // Approximate algorithm that operates in the FP domain (thus subject + // to precision loss). + const auto x = std::nearbyint(real * PowerOfTen(scale)); + const auto max_abs = PowerOfTen(precision); + if (x <= -max_abs || x >= max_abs) { + return Overflow( + "Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", real, + precision, scale); + } + + // Extract high and low bits + const auto high = std::floor(std::ldexp(x, -64)); + const auto low = x - std::ldexp(high, 64); + + assert(high >= 0); + assert(high < 9.223372036854776e+18); // 2**63 + assert(low >= 0); + assert(low < 1.8446744073709552e+19); // 2**64 + return Decimal(static_cast(high), static_cast(low)); + } + + template + static Result FromPositiveReal(Real real, int32_t precision, int32_t scale) { + constexpr int32_t kMantissaBits = RealTraits::kMantissaBits; + constexpr int32_t kMantissaDigits = RealTraits::kMantissaDigits; + + // Problem statement: construct the Decimal with the value + // closest to `real * 10^scale`. + if (scale < 0) { + // Negative scales are not handled below, fall back to approx algorithm + return FromPositiveApprox(real, precision, scale); + } + + // 1. Check that `real` is within acceptable bounds. + const Real limit = PowerOfTen(precision - scale); + if (real > limit) { + // Checking the limit early helps ensure the computations below do not + // overflow. + // NOTE: `limit` is allowed here as rounding can make it smaller than + // the theoretical limit (for example, 1.0e23 < 10^23). + return Overflow( + "Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", real, + precision, scale); + } + + // 2. Losslessly convert `real` to `mant * 2**k` + int32_t binary_exp = 0; + const Real real_mant = std::frexp(real, &binary_exp); + // `real_mant` is within 0.5 and 1 and has M bits of precision. + // Multiply it by 2^M to get an exact integer. + const auto mant = static_cast(std::ldexp(real_mant, kMantissaBits)); + const int32_t k = binary_exp - kMantissaBits; + // (note that `real = mant * 2^k`) + + // 3. Start with `mant`. + // We want to end up with `real * 10^scale` i.e. `mant * 2^k * 10^scale`. + Decimal x(mant); + + if (k < 0) { + // k < 0 (i.e. binary_exp < kMantissaBits), is probably the common case + // when converting to decimal. It implies right-shifting by -k bits, + // while multiplying by 10^scale. We also must avoid overflow (losing + // bits on the left) and precision loss (losing bits on the right). + int32_t right_shift_by = -k; + int32_t mul_by_ten_to = scale; + + // At this point, `x` has kMantissaDigits significant digits but it can + // fit kMaxPrecision (excluding sign). We can therefore multiply by up + // to 10^(kMaxPrecision - kMantissaDigits). + constexpr int32_t kSafeMulByTenTo = kMaxPrecision - kMantissaDigits; + + if (mul_by_ten_to <= kSafeMulByTenTo) { + // Scale is small enough, so we can do it all at once. + x *= DecimalPowerOfTen(mul_by_ten_to); + x = RoundedRightShift(x, right_shift_by); + } else { + // Scale is too large, we cannot multiply at once without overflow. + // We use an iterative algorithm which alternately shifts left by + // multiplying by a power of ten, and shifts right by a number of bits. + + // First multiply `x` by as large a power of ten as possible + // without overflowing. + x *= DecimalPowerOfTen(kSafeMulByTenTo); + mul_by_ten_to -= kSafeMulByTenTo; + + // `x` now has full precision. However, we know we'll only + // keep `precision` digits at the end. Extraneous bits/digits + // on the right can be safely shifted away, before multiplying + // again. + // NOTE: if `precision` is the full precision then the algorithm will + // lose the last digit. If `precision` is almost the full precision, + // there can be an off-by-one error due to rounding. + const int32_t mul_step = std::max(1, kMaxPrecision - precision); + + // The running exponent, useful to compute by how much we must + // shift right to make place on the left before the next multiply. + int32_t total_exp = 0; + int32_t total_shift = 0; + while (mul_by_ten_to > 0 && right_shift_by > 0) { + const int32_t exp = std::min(mul_by_ten_to, mul_step); + total_exp += exp; + // The supplementary right shift required so that + // `x * 10^total_exp / 2^total_shift` fits in the decimal. + assert(static_cast(total_exp) < sizeof(kCeilLog2PowersOfTen)); + const int32_t bits = + std::min(right_shift_by, kCeilLog2PowersOfTen[total_exp] - total_shift); + total_shift += bits; + // Right shift to make place on the left, then multiply + x = RoundedRightShift(x, bits); + right_shift_by -= bits; + // Should not overflow thanks to the precautions taken + x *= DecimalPowerOfTen(exp); + mul_by_ten_to -= exp; + } + if (mul_by_ten_to > 0) { + x *= DecimalPowerOfTen(mul_by_ten_to); + } + if (right_shift_by > 0) { + x = RoundedRightShift(x, right_shift_by); + } + } + } else { + // k >= 0 implies left-shifting by k bits and multiplying by 10^scale. + // The order of these operations therefore doesn't matter. We know + // we won't overflow because of the limit check above, and we also + // won't lose any significant bits on the right. + x *= DecimalPowerOfTen(scale); + x <<= k; + } + + // Rounding might have pushed `x` just above the max precision, check again + if (!x.FitsInPrecision(precision)) { + return Overflow( + "Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", real, + precision, scale); + } + return x; + } + + template + static Real ToRealPositiveNoSplit(const Decimal& decimal, int32_t scale) { + Real x = RealTraits::two_to_64(static_cast(decimal.high())); + x += static_cast(decimal.low()); + x *= LargePowerOfTen(-scale); + return x; + } + + /// An approximate conversion from Decimal128 to Real that guarantees: + /// 1. If the decimal is an integer, the conversion is exact. + /// 2. If the number of fractional digits is <= RealTraits::kMantissaDigits (e.g. + /// 8 for float and 16 for double), the conversion is within 1 ULP of the exact + /// value. + /// 3. Otherwise, the conversion is within 2^(-RealTraits::kMantissaDigits+1) + /// (e.g. 2^-23 for float and 2^-52 for double) of the exact value. + /// Here "exact value" means the closest representable value by Real. + template + static Real ToRealPositive(const Decimal& decimal, int32_t scale) { + if (scale <= 0 || + (decimal.high() == 0 && decimal.low() <= RealTraits::kMaxPreciseInteger)) { + // No need to split the decimal if it is already an integer (scale <= 0) or if it + // can be precisely represented by Real + return ToRealPositiveNoSplit(decimal, scale); + } + + // Split decimal into whole and fractional parts to avoid precision loss + auto s = decimal.GetWholeAndFraction(scale); + assert(s); + + Real whole = ToRealPositiveNoSplit(s->first, 0); + Real fraction = ToRealPositiveNoSplit(s->second, scale); + + return whole + fraction; + } + + template + static Result FromReal(Real value, int32_t precision, int32_t scale) { + assert(precision >= 1 && precision <= kMaxPrecision); + assert(scale >= -kMaxScale && scale <= kMaxScale); + + if (!std::isfinite(value)) { + return InvalidArgument("Cannot convert {} to Decimal", value); + } + + if (value == 0) { + return Decimal{}; + } + + if (value < 0) { + ICEBERG_ASSIGN_OR_RAISE(auto decimal, FromPositiveReal(-value, precision, scale)); + return decimal.Negate(); + } else { + return FromPositiveReal(value, precision, scale); + } + } + + template + static Real ToReal(const Decimal& decimal, int32_t scale) { + assert(scale >= -kMaxScale && scale <= kMaxScale); + + if (decimal.IsNegative()) { + // Convert the absolute value to avoid precision loss + auto abs = decimal; + abs.Negate(); + return -ToRealPositive(abs, scale); + } else { + return ToRealPositive(decimal, scale); + } } +}; + +} // namespace - return NotImplemented("Decimal::FromReal is not implemented yet"); +Result Decimal::FromReal(float x, int32_t precision, int32_t scale) { + return DecimalRealConversion::FromReal(x, precision, scale); +} + +Result Decimal::FromReal(double x, int32_t precision, int32_t scale) { + return DecimalRealConversion::FromReal(x, precision, scale); +} + +Result> Decimal::GetWholeAndFraction(int32_t scale) const { + assert(scale >= 0 && scale <= kMaxScale); + + Decimal multiplier(kDecimal128PowersOfTen[scale]); + return Divide(multiplier); } Result Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const { @@ -883,17 +1253,17 @@ Result Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const { return NotImplemented("Decimal::Rescale is not implemented yet"); } -template - requires std::is_floating_point_v -T Decimal::ToReal(int32_t scale) const { - if (scale < -kMaxScale || scale > kMaxScale) { - throw InvalidArgument("Decimal::ToReal: scale must be in the range [-{}, {}], was {}", - kMaxScale, kMaxScale, scale); - } +bool Decimal::FitsInPrecision(int32_t precision) const { + assert(precision >= 1 && precision <= kMaxPrecision); + return Decimal::Abs(*this) < kDecimal128PowersOfTen[precision]; +} + +float Decimal::ToFloat(int32_t scale) const { + return DecimalRealConversion::ToReal(*this, scale); +} - // Convert Decimal to floating-point value with the given scale. - // This is a placeholder implementation. - return T(0); // Replace with actual conversion logic. +double Decimal::ToDouble(int32_t scale) const { + return DecimalRealConversion::ToReal(*this, scale); } // Unary operators diff --git a/src/iceberg/expression/decimal.h b/src/iceberg/expression/decimal.h index c4672bbde..7591fd6db 100644 --- a/src/iceberg/expression/decimal.h +++ b/src/iceberg/expression/decimal.h @@ -36,10 +36,10 @@ namespace iceberg { /// 38 significant digits. class ICEBERG_EXPORT Decimal { public: - static constexpr int kBitWidth = 128; - static constexpr int kByteWidth = kBitWidth / 8; - static constexpr int kMaxPrecision = 38; - static constexpr int kMaxScale = 38; + static constexpr int32_t kBitWidth = 128; + static constexpr int32_t kByteWidth = kBitWidth / 8; + static constexpr int32_t kMaxPrecision = 38; + static constexpr int32_t kMaxScale = 38; /// \brief Default constructor initializes to zero. constexpr Decimal() noexcept : data_{0, 0} {} @@ -152,19 +152,37 @@ class ICEBERG_EXPORT Decimal { /// \brief Convert the floating-point value to a Decimal value with the given /// precision and scale. - template - requires std::is_floating_point_v - static Result FromReal(T value, int32_t precision, int32_t scale); + static Result FromReal(double real, int32_t precision, int32_t scale); + static Result FromReal(float real, int32_t precision, int32_t scale); + + /// \brief separate the integer and fractional parts for the given scale. + Result> GetWholeAndFraction(int32_t scale) const; /// \brief Convert Decimal from one scale to another. Result Rescale(int32_t orig_scale, int32_t new_scale) const; + /// \brief Whether this number fits in the given precision + /// + /// Return true if the number of significant digits is less or equal to `precision`. + bool FitsInPrecision(int32_t precision) const; + + /// \brief Convert to a floating-point number (scaled) + float ToFloat(int32_t scale) const; + /// \brief Convert to a floating-point number (scaled) + double ToDouble(int32_t scale) const; + /// \brief Convert the Decimal value to a floating-point value with the given scale. /// \param scale The scale to use for the conversion. /// \return The floating-point value. template requires std::is_floating_point_v - T ToReal(int32_t scale = 0) const; + T ToReal(int32_t scale) const { + if constexpr (std::is_same_v) { + return ToFloat(scale); + } else { + return ToDouble(scale); + } + } /// \brief Returns 1 if positive or zero, -1 if strictly negative. int64_t Sign() const { return 1 | (static_cast(data_[kHighIndex]) >> 63); } @@ -172,6 +190,8 @@ class ICEBERG_EXPORT Decimal { /// \brief Check if the Decimal value is negative. bool IsNegative() const { return static_cast(data_[kHighIndex]) < 0; } + explicit operator bool() const { return data_ != std::array{0, 0}; } + friend bool operator==(const Decimal& lhs, const Decimal& rhs) { return lhs.data_ == rhs.data_; } @@ -182,11 +202,11 @@ class ICEBERG_EXPORT Decimal { private: #if ICEBERG_LITTLE_ENDIAN - static constexpr int kHighIndex = 1; - static constexpr int kLowIndex = 0; + static constexpr int32_t kHighIndex = 1; + static constexpr int32_t kLowIndex = 0; #else - static constexpr int kHighIndex = 0; - static constexpr int kLowIndex = 1; + static constexpr int32_t kHighIndex = 0; + static constexpr int32_t kLowIndex = 1; #endif std::array data_; diff --git a/test/decimal_test.cc b/test/decimal_test.cc index dd28a0c21..cc641db4e 100644 --- a/test/decimal_test.cc +++ b/test/decimal_test.cc @@ -415,4 +415,98 @@ TEST(DecimalTest, ToString) { } } +template +struct FromRealTestParam { + Real real; + int32_t precision; + int32_t scale; + const char* expected_string; +}; + +template +void CheckDecimalFromReal(Real real, int32_t precision, int32_t scale, + const char* expected_string) { + auto result = Decimal::FromReal(real, precision, scale); + ASSERT_THAT(result, IsOk()) << "Failed to convert real to Decimal: " << real + << ", precision: " << precision << ", scale: " << scale; + const Decimal& decimal = result.value(); + EXPECT_EQ(decimal.ToString(scale).value(), expected_string); + const std::string expected_neg = + (decimal) ? "-" + std::string(expected_string) : expected_string; + auto neg_result = Decimal::FromReal(-real, precision, scale); + ASSERT_THAT(neg_result, IsOk()) + << "Failed to convert negative real to Decimal: " << -real + << ", precision: " << precision << ", scale: " << scale; + const Decimal& neg_decimal = neg_result.value(); + EXPECT_EQ(neg_decimal.ToString(scale).value(), expected_neg); +} + +using FromFloatTestParam = FromRealTestParam; +using FromDoubleTestParam = FromRealTestParam; + +template +class TestDecimalFromReal : public ::testing::Test { + public: + using ParamType = FromRealTestParam; + + void TestSuccess() { + const std::vector params{ + // clang-format off + {0.0f, 1, 0, "0"}, + {0.0f, 19, 4, "0.0000"}, + {123.0f, 7, 4, "123.0000"}, + {456.78f, 7, 4, "456.7800"}, + {456.784f, 5, 2, "456.78"}, + {456.786f, 5, 2, "456.79"}, + {999.99f, 5, 2, "999.99"}, + {123.0f, 19, 0, "123"}, + {123.4f, 19, 0, "123"}, + {123.6f, 19, 0, "124"}, + // 2**62 + {4.6116860184273879e+18, 19, 0, "4611686018427387904"}, + // 2**63 + {9.2233720368547758e+18, 19, 0, "9223372036854775808"}, + // 2**64 + {1.8446744073709552e+19, 20, 0, "18446744073709551616"}, + // clang-format on + }; + + for (const ParamType& param : params) { + CheckDecimalFromReal(param.real, param.precision, param.scale, + param.expected_string); + } + } + + void TestErrors() { + const std::vector params{ + {std::numeric_limits::infinity(), Decimal::kMaxPrecision / 2, 4, ""}, + {-std::numeric_limits::infinity(), Decimal::kMaxPrecision / 2, 4, ""}, + {std::numeric_limits::quiet_NaN(), Decimal::kMaxPrecision / 2, 4, ""}, + {-std::numeric_limits::quiet_NaN(), Decimal::kMaxPrecision / 2, 4, ""}, + }; + + for (const ParamType& param : params) { + auto result = Decimal::FromReal(param.real, param.precision, param.scale); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + } + + const std::vector overflow_params{ + // Overflow errors + {1000.0, 3, 0, ""}, {-1000.0, 3, 0, ""}, {1000.0, 5, 2, ""}, + {-1000.0, 5, 2, ""}, {999.996, 5, 2, ""}, {-999.996, 5, 2, ""}, + }; + for (const ParamType& param : overflow_params) { + auto result = Decimal::FromReal(param.real, param.precision, param.scale); + ASSERT_THAT(result, IsError(ErrorKind::kOverflow)); + } + } +}; + +using RealTypes = ::testing::Types; +TYPED_TEST_SUITE(TestDecimalFromReal, RealTypes); + +TYPED_TEST(TestDecimalFromReal, TestSuccess) { this->TestSuccess(); } + +TYPED_TEST(TestDecimalFromReal, TestErrors) { this->TestErrors(); } + } // namespace iceberg From 640f8c2b9f9f25f64ecc461581cfb9f04b439913 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Thu, 21 Aug 2025 22:13:13 +0800 Subject: [PATCH 05/20] feat: add more ToReal tests --- src/iceberg/expression/decimal.cc | 1 + test/decimal_test.cc | 390 ++++++++++++++++++++++++++++++ 2 files changed, 391 insertions(+) diff --git a/src/iceberg/expression/decimal.cc b/src/iceberg/expression/decimal.cc index cd7acaa25..b48b08143 100644 --- a/src/iceberg/expression/decimal.cc +++ b/src/iceberg/expression/decimal.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/test/decimal_test.cc b/test/decimal_test.cc index cc641db4e..213c5fbf4 100644 --- a/test/decimal_test.cc +++ b/test/decimal_test.cc @@ -19,12 +19,15 @@ #include "iceberg/expression/decimal.h" #include +#include #include +#include #include #include #include "gmock/gmock.h" +#include "gtest/gtest.h" #include "matchers.h" namespace iceberg { @@ -441,6 +444,24 @@ void CheckDecimalFromReal(Real real, int32_t precision, int32_t scale, EXPECT_EQ(neg_decimal.ToString(scale).value(), expected_neg); } +template +void CheckDecimalFromRealIntegerString(Real real, int32_t precision, int32_t scale, + const char* expected_string) { + auto result = Decimal::FromReal(real, precision, scale); + ASSERT_THAT(result, IsOk()) << "Failed to convert real to Decimal: " << real + << ", precision: " << precision << ", scale: " << scale; + const Decimal& decimal = result.value(); + EXPECT_EQ(decimal.ToIntegerString(), expected_string); + const std::string expected_neg = + (decimal) ? "-" + std::string(expected_string) : expected_string; + auto neg_result = Decimal::FromReal(-real, precision, scale); + ASSERT_THAT(neg_result, IsOk()) + << "Failed to convert negative real to Decimal: " << -real + << ", precision: " << precision << ", scale: " << scale; + const Decimal& neg_decimal = neg_result.value(); + EXPECT_EQ(neg_decimal.ToIntegerString(), expected_neg); +} + using FromFloatTestParam = FromRealTestParam; using FromDoubleTestParam = FromRealTestParam; @@ -509,4 +530,373 @@ TYPED_TEST(TestDecimalFromReal, TestSuccess) { this->TestSuccess(); } TYPED_TEST(TestDecimalFromReal, TestErrors) { this->TestErrors(); } +TEST(TestDecimalFromReal, FromFloat) { + const std::vector params{ + // -- Stress the 24 bits of precision of a float + // 2**63 + 2**40 + FromFloatTestParam{9.223373e+18f, 19, 0, "9223373136366403584"}, + // 2**64 - 2**40 + FromFloatTestParam{1.8446743e+19f, 20, 0, "18446742974197923840"}, + // 2**64 + 2**41 + FromFloatTestParam{1.8446746e+19f, 20, 0, "18446746272732807168"}, + // 2**14 - 2**-10 + FromFloatTestParam{16383.999f, 8, 3, "16383.999"}, + FromFloatTestParam{16383.999f, 19, 3, "16383.999"}, + // 1 - 2**-24 + FromFloatTestParam{0.99999994f, 10, 10, "0.9999999404"}, + FromFloatTestParam{0.99999994f, 15, 15, "0.999999940395355"}, + FromFloatTestParam{0.99999994f, 20, 20, "0.99999994039535522461"}, + FromFloatTestParam{0.99999994f, 21, 21, "0.999999940395355224609"}, + FromFloatTestParam{0.99999994f, 38, 38, "0.99999994039535522460937500000000000000"}, + // -- Other cases + // 10**38 - 2**103 + FromFloatTestParam{9.999999e+37f, 38, 0, "99999986661652122824821048795547566080"}, + }; + + for (const auto& param : params) { + CheckDecimalFromReal(param.real, param.precision, param.scale, param.expected_string); + } +} + +TEST(TestDecimalFromReal, FromFloatLargeValues) { + for (int32_t scale = -38; scale <= 38; ++scale) { + float real = std::pow(10.0f, static_cast(scale)); + CheckDecimalFromRealIntegerString(real, 1, -scale, "1"); + } + + for (int32_t scale = -37; scale <= 36; ++scale) { + float real = 123.f * std::pow(10.f, static_cast(scale)); + CheckDecimalFromRealIntegerString(real, 2, -scale - 1, "12"); + CheckDecimalFromRealIntegerString(real, 3, -scale, "123"); + CheckDecimalFromRealIntegerString(real, 4, -scale + 1, "1230"); + } +} + +TEST(TestDecimalFromReal, FromDouble) { + const std::vector params{ + // -- Stress the 53 bits of precision of a double + // 2**63 + 2**11 + FromDoubleTestParam{9.223372036854778e+18, 19, 0, "9223372036854777856"}, + // 2**64 - 2**11 + FromDoubleTestParam{1.844674407370955e+19, 20, 0, "18446744073709549568"}, + // 2**64 + 2**11 + FromDoubleTestParam{1.8446744073709556e+19, 20, 0, "18446744073709555712"}, + // 2**126 + FromDoubleTestParam{8.507059173023462e+37, 38, 0, + "85070591730234615865843651857942052864"}, + // 2**126 - 2**74 + FromDoubleTestParam{8.50705917302346e+37, 38, 0, + "85070591730234596976377720379361198080"}, + // 2**36 - 2**-16 + FromDoubleTestParam{68719476735.999985, 11, 0, "68719476736"}, + FromDoubleTestParam{68719476735.999985, 38, 27, + "68719476735.999984741210937500000000000"}, + // -- Other cases + // Almost 10**38 (minus 2**73) + FromDoubleTestParam{9.999999999999998e+37, 38, 0, + "99999999999999978859343891977453174784"}, + FromDoubleTestParam{9.999999999999998e+27, 38, 10, + "9999999999999997384096481280.0000000000"}, + // 10**N (sometimes fits in N digits) + FromDoubleTestParam{1e23, 23, 0, "99999999999999991611392"}, + FromDoubleTestParam{1e23, 24, 1, "99999999999999991611392.0"}, + FromDoubleTestParam{1e36, 37, 0, "1000000000000000042420637374017961984"}, + FromDoubleTestParam{1e36, 38, 1, "1000000000000000042420637374017961984.0"}, + FromDoubleTestParam{1e37, 37, 0, "9999999999999999538762658202121142272"}, + FromDoubleTestParam{1e37, 38, 1, "9999999999999999538762658202121142272.0"}, + FromDoubleTestParam{1e38, 38, 0, "99999999999999997748809823456034029568"}, + // Hand-picked test cases that can involve precision issues. + // More comprehensive testing is done in the PyArrow test suite. + FromDoubleTestParam{9.223372036854778e+10, 19, 8, "92233720368.54777527"}, + FromDoubleTestParam{1.8446744073709556e+15, 20, 4, "1844674407370955.5000"}, + FromDoubleTestParam{999999999999999.0, 16, 1, "999999999999999.0"}, + FromDoubleTestParam{9999999999999998.0, 17, 1, "9999999999999998.0"}, + FromDoubleTestParam{999999999999999.9, 16, 1, "999999999999999.9"}, + FromDoubleTestParam{9999999987., 38, 22, "9999999987.0000000000000000000000"}, + FromDoubleTestParam{9999999987., 38, 28, "9999999987.0000000000000000000000000000"}, + // 1 - 2**-52 + // XXX the result should be 0.99999999999999977795539507496869191527 + // but our algorithm loses the right digit. + FromDoubleTestParam{0.9999999999999998, 38, 38, + "0.99999999999999977795539507496869191520"}, + FromDoubleTestParam{0.9999999999999998, 20, 20, "0.99999999999999977796"}, + FromDoubleTestParam{0.9999999999999998, 16, 16, "0.9999999999999998"}, + }; + + for (const auto& param : params) { + CheckDecimalFromReal(param.real, param.precision, param.scale, param.expected_string); + } +} + +TEST(TestDecimalFromReal, FromDoubleLargeValues) { + constexpr auto kMaxScale = Decimal::kMaxScale; + for (int32_t scale = -kMaxScale; scale <= kMaxScale; ++scale) { + if (std::abs(1 - scale) < kMaxScale) { + double real = std::pow(10.0, static_cast(scale)); + CheckDecimalFromRealIntegerString(real, 1, -scale, "1"); + } + } + + for (int32_t scale = -kMaxScale + 1; scale <= kMaxScale - 1; ++scale) { + if (std::abs(4 - scale) < kMaxScale) { + double real = 123. * std::pow(10.0, static_cast(scale)); + CheckDecimalFromRealIntegerString(real, 2, -scale - 1, "12"); + CheckDecimalFromRealIntegerString(real, 3, -scale, "123"); + CheckDecimalFromRealIntegerString(real, 4, -scale + 1, "1230"); + } + } +} + +template +struct ToRealTestParam { + std::string decimal_value; + int32_t scale; + Real expected; +}; + +using ToFloatTestParam = ToRealTestParam; +using ToDoubleTestParam = ToRealTestParam; + +template +void CheckDecimalToReal(const std::string& decimal_value, int32_t scale, Real expected) { + auto result = Decimal::FromString(decimal_value); + ASSERT_THAT(result, IsOk()) << "Failed to convert string to Decimal: " << decimal_value; + + const Decimal& decimal = result.value(); + auto real_result = decimal.ToReal(scale); + + EXPECT_EQ(real_result, expected) + << "Expected: " << expected << ", but got: " << real_result; +} + +template +void CheckDecimalToRealWithinOneULP(const std::string& decimal_value, int32_t scale, + Real expected) { + Decimal dec(decimal_value); + Real actual = dec.template ToReal(scale); + ASSERT_TRUE(actual == expected || actual == std::nextafter(expected, expected + 1) || + actual == std::nextafter(expected, expected - 1)) + << "Decimal value: " << decimal_value << ", scale: " << scale + << ", expected: " << expected << ", actual: " << actual; +} + +template +void CheckDecimalToRealWithinEpsilon(const std::string& decimal_value, int32_t scale, + Real epsilon, Real expected) { + Decimal dec(decimal_value); + Real actual = dec.template ToReal(scale); + ASSERT_LE(std::abs(actual - expected), epsilon) + << "Decimal value: " << decimal_value << ", scale: " << scale + << ", expected: " << expected << ", actual: " << actual; +} + +void CheckDecimalToRealApprox(const std::string& decimal_value, int32_t scale, + float expected) { + Decimal dec(decimal_value); + auto actual = dec.template ToReal(scale); + ASSERT_FLOAT_EQ(actual, expected) + << "Decimal value: " << decimal_value << ", scale: " << scale + << ", expected: " << expected << ", actual: " << actual; +} + +void CheckDecimalToRealApprox(const std::string& decimal_value, int32_t scale, + double expected) { + Decimal dec(decimal_value); + auto actual = dec.template ToReal(scale); + ASSERT_DOUBLE_EQ(actual, expected) + << "Decimal value: " << decimal_value << ", scale: " << scale + << ", expected: " << expected << ", actual: " << actual; +} + +template +class TestDecimalToReal : public ::testing::Test { + public: + using ParamType = ToRealTestParam; + + Real Pow2(int exp) { return std::pow(static_cast(2), static_cast(exp)); } + + Real Pow10(int exp) { return std::pow(static_cast(10), static_cast(exp)); } + + void TestSuccess() { + const std::vector params{ + // clang-format off + {"0", 0, 0.0f}, + {"0", 10, 0.0f}, + {"0", -10, 0.0f}, + {"1", 0, 1.0f}, + {"12345", 0, 12345.f}, +#ifndef __MINGW32__ // MinGW has precision issues + {"12345", 1, 1234.5f}, +#endif + {"12345", -3, 12345000.f}, + // 2**62 + {"4611686018427387904", 0, Pow2(62)}, + // 2**63 + 2**62 + {"13835058055282163712", 0, Pow2(63) + Pow2(62)}, + // 2**64 + 2**62 + {"23058430092136939520", 0, Pow2(64) + Pow2(62)}, + // 10**38 - 2**103 +#ifndef __MINGW32__ // MinGW has precision issues + {"99999989858795198174164788026374356992", 0, Pow10(38) - Pow2(103)}, +#endif + // clang-format on + }; + + for (const ParamType& param : params) { + CheckDecimalToReal(param.decimal_value, param.scale, param.expected); + if (param.decimal_value != "0") { + // Check negative values as well + CheckDecimalToReal("-" + param.decimal_value, param.scale, -param.expected); + } + } + } +}; + +TYPED_TEST_SUITE(TestDecimalToReal, RealTypes); + +TYPED_TEST(TestDecimalToReal, TestSuccess) { this->TestSuccess(); } + +// Custom test for Decimal::ToReal +TEST(TestDecimalToReal, ToFloatLargeValues) { + constexpr auto max_scale = Decimal::kMaxScale; + + // Note that exact comparisons would succeed on some platforms (Linux, macOS). + // Nevertheless, power-of-ten factors are not all exactly representable + // in binary floating point. + for (int32_t scale = -max_scale; scale <= max_scale; scale++) { +#ifdef _WIN32 + // MSVC gives pow(10.f, -45.f) == 0 even though 1e-45f is nonzero + if (scale == 45) continue; +#endif + CheckDecimalToRealApprox( + "1", scale, std::pow(static_cast(10), static_cast(-scale))); + } + for (int32_t scale = -max_scale; scale <= max_scale - 2; scale++) { +#ifdef _WIN32 + // MSVC gives pow(10.f, -45.f) == 0 even though 1e-45f is nonzero + if (scale == 45) continue; +#endif + const auto factor = static_cast(123); + CheckDecimalToRealApprox( + "123", scale, + factor * std::pow(static_cast(10), static_cast(-scale))); + } +} + +TEST(TestDecimalToReal, ToFloatPrecision) { + // 2**63 + 2**40 (exactly representable in a float's 24 bits of precision) + CheckDecimalToReal("9223373136366403584", 0, 9.223373e+18f); + CheckDecimalToReal("-9223373136366403584", 0, -9.223373e+18f); + + // 2**64 + 2**41 (exactly representable in a float) + CheckDecimalToReal("18446746272732807168", 0, 1.8446746e+19f); + CheckDecimalToReal("-18446746272732807168", 0, -1.8446746e+19f); + + // Integers are always exact + auto scale = Decimal::kMaxScale - 1; + std::string seven = "7."; + seven.append(scale, '0'); // pad with trailing zeros + CheckDecimalToReal(seven, scale, 7.0f); + CheckDecimalToReal("-" + seven, scale, -7.0f); + + CheckDecimalToReal("99999999999999999999.0000000000000000", 16, + 99999999999999999999.0f); + CheckDecimalToReal("-99999999999999999999.0000000000000000", 16, + -99999999999999999999.0f); + + // Small fractions are within one ULP + CheckDecimalToRealWithinOneULP("9999999.9", 1, 9999999.9f); + CheckDecimalToRealWithinOneULP("-9999999.9", 1, -9999999.9f); + + CheckDecimalToRealWithinOneULP("9999999.999999", 6, 9999999.999999f); + CheckDecimalToRealWithinOneULP("-9999999.999999", 6, -9999999.999999f); + + // Large fractions are within 2^-23 + constexpr float epsilon = 1.1920928955078125e-07f; // 2^-23 + CheckDecimalToRealWithinEpsilon("112334829348925.99070703983306884765625", 23, + epsilon, + 112334829348925.99070703983306884765625f); + CheckDecimalToRealWithinEpsilon("1.987748987892758765582589910934859345", 36, + epsilon, + 1.987748987892758765582589910934859345f); +} + +// ToReal tests are disabled on MinGW because of precision issues in results +#ifndef __MINGW32__ + +TEST(TestDecimalToReal, ToDoubleLargeValues) { + // Note that exact comparisons would succeed on some platforms (Linux, macOS). + // Nevertheless, power-of-ten factors are not all exactly representable + // in binary floating point. + constexpr auto max_scale = Decimal::kMaxScale; + + for (int32_t scale = -max_scale; scale <= max_scale; scale++) { + CheckDecimalToRealApprox( + "1", scale, std::pow(static_cast(10), static_cast(-scale))); + } + for (int32_t scale = -max_scale + 1; scale <= max_scale - 1; scale++) { + const auto factor = static_cast(123); + CheckDecimalToRealApprox( + "123", scale, + factor * std::pow(static_cast(10), static_cast(-scale))); + } +} + +TEST(TestDecimalToReal, ToDoublePrecision) { + // 2**63 + 2**11 (exactly representable in a double's 53 bits of precision) + CheckDecimalToReal("9223372036854777856", 0, 9.223372036854778e+18); + CheckDecimalToReal("-9223372036854777856", 0, -9.223372036854778e+18); + // 2**64 - 2**11 (exactly representable in a double) + CheckDecimalToReal("18446744073709549568", 0, 1.844674407370955e+19); + CheckDecimalToReal("-18446744073709549568", 0, -1.844674407370955e+19); + // 2**64 + 2**11 (exactly representable in a double) + CheckDecimalToReal("18446744073709555712", 0, 1.8446744073709556e+19); + CheckDecimalToReal("-18446744073709555712", 0, -1.8446744073709556e+19); + + // Almost 10**38 (minus 2**73) + CheckDecimalToReal("99999999999999978859343891977453174784", 0, + 9.999999999999998e+37); + CheckDecimalToReal("-99999999999999978859343891977453174784", 0, + -9.999999999999998e+37); + CheckDecimalToReal("99999999999999978859343891977453174784", 10, + 9.999999999999998e+27); + CheckDecimalToReal("-99999999999999978859343891977453174784", 10, + -9.999999999999998e+27); + CheckDecimalToReal("99999999999999978859343891977453174784", -10, + 9.999999999999998e+47); + CheckDecimalToReal("-99999999999999978859343891977453174784", -10, + -9.999999999999998e+47); + + // Integers are always exact + auto scale = Decimal::kMaxScale - 1; + std::string seven = "7."; + seven.append(scale, '0'); + CheckDecimalToReal(seven, scale, 7.0); + CheckDecimalToReal("-" + seven, scale, -7.0); + + CheckDecimalToReal("99999999999999999999.0000000000000000", 16, + 99999999999999999999.0); + CheckDecimalToReal("-99999999999999999999.0000000000000000", 16, + -99999999999999999999.0); + + // Small fractions are within one ULP + CheckDecimalToRealWithinOneULP("9999999.9", 1, 9999999.9); + CheckDecimalToRealWithinOneULP("-9999999.9", 1, -9999999.9); + + CheckDecimalToRealWithinOneULP("9999999.999999999999999", 15, + 9999999.999999999999999); + CheckDecimalToRealWithinOneULP("-9999999.999999999999999", 15, + -9999999.999999999999999); + // Large fractions are within 2^-52 + constexpr double epsilon = 2.220446049250313080847263336181640625e-16; // 2^-52 + CheckDecimalToRealWithinEpsilon("112334829348925.99070703983306884765625", 23, + epsilon, + 112334829348925.99070703983306884765625); + CheckDecimalToRealWithinEpsilon("1.987748987892758765582589910934859345", 36, + epsilon, + 1.987748987892758765582589910934859345); +} + +#endif // __MINGW32__ + } // namespace iceberg From 51cf0481efb81814bf4e1d5728a25abcbd5b3ae2 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Thu, 21 Aug 2025 22:53:25 +0800 Subject: [PATCH 06/20] feat: add FromBigEndian --- src/iceberg/expression/decimal.cc | 71 +++++++++++++++++++++++++++++ src/iceberg/expression/decimal.h | 16 +++++++ test/decimal_test.cc | 74 +++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+) diff --git a/src/iceberg/expression/decimal.cc b/src/iceberg/expression/decimal.cc index b48b08143..b378a91f5 100644 --- a/src/iceberg/expression/decimal.cc +++ b/src/iceberg/expression/decimal.cc @@ -1216,6 +1216,24 @@ struct DecimalRealConversion { } }; +// Helper function used by Decimal::FromBigEndian +static inline uint64_t UInt64FromBigEndian(const uint8_t* bytes, int32_t length) { + // We don't bounds check the length here because this is called by + // FromBigEndian that has a Decimal128 as its out parameters and + // that function is already checking the length of the bytes and only + // passes lengths between zero and eight. + uint64_t result = 0; + // Using memcpy instead of special casing for length + // and doing the conversion in 16, 32 parts, which could + // possibly create unaligned memory access on certain platforms + memcpy(reinterpret_cast(&result) + 8 - length, bytes, length); +#if ICEBERG_LITTLE_ENDIAN + return std::byteswap(result); +#else + return result; +#endif +} + } // namespace Result Decimal::FromReal(float x, int32_t precision, int32_t scale) { @@ -1233,6 +1251,59 @@ Result> Decimal::GetWholeAndFraction(int32_t scale) return Divide(multiplier); } +Result Decimal::FromBigEndian(const uint8_t* bytes, int32_t length) { + static constexpr int32_t kMinDecimalBytes = 1; + static constexpr int32_t kMaxDecimalBytes = 16; + + int64_t high, low; + + if (length < kMinDecimalBytes || length > kMaxDecimalBytes) { + return InvalidArgument( + "Decimal::FromBigEndian: length must be in the range [{}, {}], was {}", + kMinDecimalBytes, kMaxDecimalBytes, length); + } + + // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the + // sign bit. + const bool is_negative = static_cast(bytes[0]) < 0; + + // 1. Extract the high bytes + // Stop byte of the high bytes + const int32_t high_bits_offset = std::max(0, length - 8); + const auto high_bits = UInt64FromBigEndian(bytes, high_bits_offset); + + if (high_bits_offset == 8) { + // Avoid undefined shift by 64 below + high = high_bits; + } else { + high = -1 * (is_negative && length < kMaxDecimalBytes); + // Shift left enough bits to make room for the incoming int64_t + high = SafeLeftShift(high, high_bits_offset * CHAR_BIT); + // Preserve the upper bits by inplace OR-ing the int64_t + high |= high_bits; + } + + // 2. Extract the low bytes + // Stop byte of the low bytes + const int32_t low_bits_offset = std::min(length, 8); + const auto low_bits = + UInt64FromBigEndian(bytes + high_bits_offset, length - high_bits_offset); + + if (low_bits_offset == 8) { + // Avoid undefined shift by 64 below + low = low_bits; + } else { + // Sign extend the low bits if necessary + low = -1 * (is_negative && length < 8); + // Shift left enough bits to make room for the incoming int64_t + low = SafeLeftShift(low, low_bits_offset * CHAR_BIT); + // Preserve the upper bits by inplace OR-ing the int64_t + low |= low_bits; + } + + return Decimal(high, static_cast(low)); +} + Result Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const { if (orig_scale < 0 || orig_scale > kMaxScale) { return InvalidArgument( diff --git a/src/iceberg/expression/decimal.h b/src/iceberg/expression/decimal.h index 7591fd6db..b6781393a 100644 --- a/src/iceberg/expression/decimal.h +++ b/src/iceberg/expression/decimal.h @@ -155,6 +155,11 @@ class ICEBERG_EXPORT Decimal { static Result FromReal(double real, int32_t precision, int32_t scale); static Result FromReal(float real, int32_t precision, int32_t scale); + /// \brief Convert from a big-endian byte representation. The length must be + /// between 1 and 16. + /// \return error status if the length is an invalid value + static Result FromBigEndian(const uint8_t* data, int32_t length); + /// \brief separate the integer and fractional parts for the given scale. Result> GetWholeAndFraction(int32_t scale) const; @@ -184,6 +189,17 @@ class ICEBERG_EXPORT Decimal { } } + const uint8_t* native_endian_bytes() const { + return reinterpret_cast(data_.data()); + } + + /// \brief Return the raw bytes of the value in native-endian byte order. + std::array ToBytes() const { + std::array out{{0}}; + memcpy(out.data(), data_.data(), kByteWidth); + return out; + } + /// \brief Returns 1 if positive or zero, -1 if strictly negative. int64_t Sign() const { return 1 | (static_cast(data_[kHighIndex]) >> 63); } diff --git a/test/decimal_test.cc b/test/decimal_test.cc index 213c5fbf4..ee19e4930 100644 --- a/test/decimal_test.cc +++ b/test/decimal_test.cc @@ -18,6 +18,7 @@ */ #include "iceberg/expression/decimal.h" +#include #include #include #include @@ -899,4 +900,77 @@ TEST(TestDecimalToReal, ToDoublePrecision) { #endif // __MINGW32__ +TEST(DecimalTest, FromBigEndian) { + // We test out a variety of scenarios: + // + // * Positive values that are left shifted + // and filled in with the same bit pattern + // * Negated of the positive values + // * Complement of the positive values + // + // For the positive values, we can call FromBigEndian + // with a length that is less than 16, whereas we must + // pass all 16 bytes for the negative and complement. + // + // We use a number of bit patterns to increase the coverage + // of scenarios + constexpr int WidthMinusOne = Decimal::kByteWidth - 1; + + for (int32_t start : {1, 15, /* 00001111 */ + 85, /* 01010101 */ + 127 /* 01111111 */}) { + Decimal value(start); + for (int ii = 0; ii < Decimal::kByteWidth; ++ii) { + auto native_endian = value.ToBytes(); +#if ICEBERG_LITTLE_ENDIAN + std::ranges::reverse(native_endian); +#endif + // Limit the number of bytes we are passing to make + // sure that it works correctly. That's why all of the + // 'start' values don't have a 1 in the most significant + // bit place + auto result = + Decimal::FromBigEndian(native_endian.data() + WidthMinusOne - ii, ii + 1); + ASSERT_THAT(result, IsOk()); + const Decimal& decimal = result.value(); + EXPECT_EQ(decimal, value); + + // Negate it + auto negated = -value; + native_endian = negated.ToBytes(); + +#if ICEBERG_LITTLE_ENDIAN + // convert to big endian + std::ranges::reverse(native_endian); +#endif + result = Decimal::FromBigEndian(native_endian.data() + WidthMinusOne - ii, ii + 1); + ASSERT_THAT(result, IsOk()); + const Decimal& negated_decimal = result.value(); + EXPECT_EQ(negated_decimal, negated); + + // Take the complement + auto complement = ~value; + native_endian = complement.ToBytes(); + +#if ICEBERG_LITTLE_ENDIAN + // convert to big endian + std::ranges::reverse(native_endian); +#endif + result = Decimal::FromBigEndian(native_endian.data(), Decimal::kByteWidth); + ASSERT_THAT(result, IsOk()); + const Decimal& complement_decimal = result.value(); + EXPECT_EQ(complement_decimal, complement); + + value <<= 2; + value += Decimal(start); + } + } +} + +TEST(DecimalTest, FromBigEndianInvalid) { + ASSERT_THAT(Decimal::FromBigEndian(nullptr, -1), IsError(ErrorKind::kInvalidArgument)); + ASSERT_THAT(Decimal::FromBigEndian(nullptr, Decimal::kByteWidth + 1), + IsError(ErrorKind::kInvalidArgument)); +} + } // namespace iceberg From 83487db4ddb85fc788d7a4762e4fa6c1dc01f460 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Thu, 21 Aug 2025 23:11:09 +0800 Subject: [PATCH 07/20] fix: include missing headers --- src/iceberg/expression/decimal.cc | 8 ++++++++ src/iceberg/expression/decimal.h | 10 +++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/iceberg/expression/decimal.cc b/src/iceberg/expression/decimal.cc index b378a91f5..3868c877a 100644 --- a/src/iceberg/expression/decimal.cc +++ b/src/iceberg/expression/decimal.cc @@ -24,9 +24,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -1338,6 +1340,12 @@ double Decimal::ToDouble(int32_t scale) const { return DecimalRealConversion::ToReal(*this, scale); } +std::array Decimal::ToBytes() const { + std::array out{{0}}; + memcpy(out.data(), data_.data(), kByteWidth); + return out; +} + // Unary operators ICEBERG_EXPORT Decimal operator-(const Decimal& operand) { Decimal result(operand.high(), operand.low()); diff --git a/src/iceberg/expression/decimal.h b/src/iceberg/expression/decimal.h index b6781393a..44b161620 100644 --- a/src/iceberg/expression/decimal.h +++ b/src/iceberg/expression/decimal.h @@ -168,7 +168,7 @@ class ICEBERG_EXPORT Decimal { /// \brief Whether this number fits in the given precision /// - /// Return true if the number of significant digits is less or equal to `precision`. + /// Returns true if the number of significant digits is less or equal to `precision`. bool FitsInPrecision(int32_t precision) const; /// \brief Convert to a floating-point number (scaled) @@ -193,12 +193,8 @@ class ICEBERG_EXPORT Decimal { return reinterpret_cast(data_.data()); } - /// \brief Return the raw bytes of the value in native-endian byte order. - std::array ToBytes() const { - std::array out{{0}}; - memcpy(out.data(), data_.data(), kByteWidth); - return out; - } + /// \brief Returns the raw bytes of the value in native-endian byte order. + std::array ToBytes() const; /// \brief Returns 1 if positive or zero, -1 if strictly negative. int64_t Sign() const { return 1 | (static_cast(data_[kHighIndex]) >> 63); } From aaafbfefb65e2314bb17a1e14cefb942c12c4af9 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Fri, 22 Aug 2025 20:51:59 +0800 Subject: [PATCH 08/20] feat: add more tests --- src/iceberg/expression/decimal.cc | 25 +- src/iceberg/expression/decimal.h | 25 ++ src/iceberg/result.h | 8 +- test/decimal_test.cc | 408 ++++++++++++++++++++++++++---- 4 files changed, 404 insertions(+), 62 deletions(-) diff --git a/src/iceberg/expression/decimal.cc b/src/iceberg/expression/decimal.cc index 3868c877a..3cef87f32 100644 --- a/src/iceberg/expression/decimal.cc +++ b/src/iceberg/expression/decimal.cc @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -183,7 +184,7 @@ static Status BuildFromArray(Decimal* result, const uint32_t* array, int64_t len std::array result_array = {0, 0}; for (int64_t i = length - 2 * 2 - 1; i >= 0; i--) { if (array[i] != 0) { - return Overflow("Decimal division overflow"); + return Invalid("Decimal division overflow"); } } @@ -250,7 +251,7 @@ static inline Status DecimalDivide(const Decimal& dividend, const Decimal& divis } if (divisor_length == 0) { - return DivideByZero("Cannot divide by zero in DecimalDivide"); + return Invalid("Cannot divide by zero in DecimalDivide"); } if (divisor_length == 1) { @@ -1013,9 +1014,8 @@ struct DecimalRealConversion { const auto x = std::nearbyint(real * PowerOfTen(scale)); const auto max_abs = PowerOfTen(precision); if (x <= -max_abs || x >= max_abs) { - return Overflow( - "Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", real, - precision, scale); + return Invalid("Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", + real, precision, scale); } // Extract high and low bits @@ -1048,9 +1048,8 @@ struct DecimalRealConversion { // overflow. // NOTE: `limit` is allowed here as rounding can make it smaller than // the theoretical limit (for example, 1.0e23 < 10^23). - return Overflow( - "Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", real, - precision, scale); + return Invalid("Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", + real, precision, scale); } // 2. Losslessly convert `real` to `mant * 2**k` @@ -1140,9 +1139,8 @@ struct DecimalRealConversion { // Rounding might have pushed `x` just above the max precision, check again if (!x.FitsInPrecision(precision)) { - return Overflow( - "Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", real, - precision, scale); + return Invalid("Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", + real, precision, scale); } return x; } @@ -1346,6 +1344,11 @@ std::array Decimal::ToBytes() const { return out; } +ICEBERG_EXPORT std::ostream& operator<<(std::ostream& os, const Decimal& decimal) { + os << decimal.ToIntegerString(); + return os; +} + // Unary operators ICEBERG_EXPORT Decimal operator-(const Decimal& operand) { Decimal result(operand.high(), operand.low()); diff --git a/src/iceberg/expression/decimal.h b/src/iceberg/expression/decimal.h index 44b161620..0d497fbfe 100644 --- a/src/iceberg/expression/decimal.h +++ b/src/iceberg/expression/decimal.h @@ -27,6 +27,7 @@ #include "iceberg/iceberg_export.h" #include "iceberg/result.h" +#include "iceberg/util/macros.h" #include "iceberg/util/port.h" namespace iceberg { @@ -171,6 +172,28 @@ class ICEBERG_EXPORT Decimal { /// Returns true if the number of significant digits is less or equal to `precision`. bool FitsInPrecision(int32_t precision) const; + /// \brief Convert to a signed integer + template + requires std::is_same_v || std::is_same_v + Result ToInteger() const { + constexpr auto min_value = std::numeric_limits::min(); + constexpr auto max_value = std::numeric_limits::max(); + const auto& self = *this; + if (self < min_value || self > max_value) { + return Invalid("Invalid cast from Decimal to {} byte integer", sizeof(T)); + } + return static_cast(low()); + } + + /// \brief Convert to a signed integer + template + requires std::is_same_v || std::is_same_v + Status ToInteger(T* out) const { + ICEBERG_ASSIGN_OR_RAISE(auto result, ToInteger()); + *out = result; + return {}; + } + /// \brief Convert to a floating-point number (scaled) float ToFloat(int32_t scale) const; /// \brief Convert to a floating-point number (scaled) @@ -212,6 +235,8 @@ class ICEBERG_EXPORT Decimal { return lhs.data_ != rhs.data_; } + friend std::ostream& operator<<(std::ostream& os, const Decimal& decimal); + private: #if ICEBERG_LITTLE_ENDIAN static constexpr int32_t kHighIndex = 1; diff --git a/src/iceberg/result.h b/src/iceberg/result.h index a70ebac5a..79dd52b93 100644 --- a/src/iceberg/result.h +++ b/src/iceberg/result.h @@ -32,7 +32,7 @@ enum class ErrorKind { kAlreadyExists, kCommitStateUnknown, kDecompressError, - kDivideByZero, + kInvalid, // For general invalid errors kInvalidArgument, kInvalidArrowData, kInvalidExpression, @@ -47,8 +47,6 @@ enum class ErrorKind { kNotFound, kNotImplemented, kNotSupported, - kOverflow, - kRescaleDataLoss, kUnknownError, }; @@ -82,7 +80,7 @@ using Status = Result; DEFINE_ERROR_FUNCTION(AlreadyExists) DEFINE_ERROR_FUNCTION(CommitStateUnknown) DEFINE_ERROR_FUNCTION(DecompressError) -DEFINE_ERROR_FUNCTION(DivideByZero) +DEFINE_ERROR_FUNCTION(Invalid) DEFINE_ERROR_FUNCTION(InvalidArgument) DEFINE_ERROR_FUNCTION(InvalidArrowData) DEFINE_ERROR_FUNCTION(InvalidExpression) @@ -97,8 +95,6 @@ DEFINE_ERROR_FUNCTION(NotAllowed) DEFINE_ERROR_FUNCTION(NotFound) DEFINE_ERROR_FUNCTION(NotImplemented) DEFINE_ERROR_FUNCTION(NotSupported) -DEFINE_ERROR_FUNCTION(Overflow) -DEFINE_ERROR_FUNCTION(RescaleDataLoss) DEFINE_ERROR_FUNCTION(UnknownError) #undef DEFINE_ERROR_FUNCTION diff --git a/test/decimal_test.cc b/test/decimal_test.cc index ee19e4930..61119d7b7 100644 --- a/test/decimal_test.cc +++ b/test/decimal_test.cc @@ -24,11 +24,12 @@ #include #include +#include #include #include #include "gmock/gmock.h" -#include "gtest/gtest.h" +#include "iceberg/util/port.h" #include "matchers.h" namespace iceberg { @@ -47,6 +48,11 @@ void AssertDecimalFromString(const std::string& s, const Decimal& expected, EXPECT_EQ(expected_scale, scale); } +Decimal DecimalFromInt128(int128_t value) { + return {static_cast(value >> 64), + static_cast(value & 0xFFFFFFFFFFFFFFFFULL)}; +} + } // namespace TEST(DecimalTest, Basics) { @@ -519,7 +525,7 @@ class TestDecimalFromReal : public ::testing::Test { }; for (const ParamType& param : overflow_params) { auto result = Decimal::FromReal(param.real, param.precision, param.scale); - ASSERT_THAT(result, IsError(ErrorKind::kOverflow)); + ASSERT_THAT(result, IsError(ErrorKind::kInvalid)); } } }; @@ -535,23 +541,52 @@ TEST(TestDecimalFromReal, FromFloat) { const std::vector params{ // -- Stress the 24 bits of precision of a float // 2**63 + 2**40 - FromFloatTestParam{9.223373e+18f, 19, 0, "9223373136366403584"}, + FromFloatTestParam{.real = 9.223373e+18f, + .precision = 19, + .scale = 0, + .expected_string = "9223373136366403584"}, // 2**64 - 2**40 - FromFloatTestParam{1.8446743e+19f, 20, 0, "18446742974197923840"}, + FromFloatTestParam{.real = 1.8446743e+19f, + .precision = 20, + .scale = 0, + .expected_string = "18446742974197923840"}, // 2**64 + 2**41 - FromFloatTestParam{1.8446746e+19f, 20, 0, "18446746272732807168"}, + FromFloatTestParam{.real = 1.8446746e+19f, + .precision = 20, + .scale = 0, + .expected_string = "18446746272732807168"}, // 2**14 - 2**-10 - FromFloatTestParam{16383.999f, 8, 3, "16383.999"}, - FromFloatTestParam{16383.999f, 19, 3, "16383.999"}, + FromFloatTestParam{ + .real = 16383.999f, .precision = 8, .scale = 3, .expected_string = "16383.999"}, + FromFloatTestParam{16383.999f, .precision = 19, .scale = 3, + .expected_string = "16383.999"}, // 1 - 2**-24 - FromFloatTestParam{0.99999994f, 10, 10, "0.9999999404"}, - FromFloatTestParam{0.99999994f, 15, 15, "0.999999940395355"}, - FromFloatTestParam{0.99999994f, 20, 20, "0.99999994039535522461"}, - FromFloatTestParam{0.99999994f, 21, 21, "0.999999940395355224609"}, - FromFloatTestParam{0.99999994f, 38, 38, "0.99999994039535522460937500000000000000"}, + FromFloatTestParam{.real = 0.99999994f, + .precision = 10, + .scale = 10, + .expected_string = "0.9999999404"}, + FromFloatTestParam{.real = 0.99999994f, + .precision = 15, + .scale = 15, + .expected_string = "0.999999940395355"}, + FromFloatTestParam{.real = 0.99999994f, + .precision = 20, + .scale = 20, + .expected_string = "0.99999994039535522461"}, + FromFloatTestParam{.real = 0.99999994f, + .precision = 21, + .scale = 21, + .expected_string = "0.999999940395355224609"}, + FromFloatTestParam{.real = 0.99999994f, + .precision = 38, + .scale = 38, + .expected_string = "0.99999994039535522460937500000000000000"}, // -- Other cases // 10**38 - 2**103 - FromFloatTestParam{9.999999e+37f, 38, 0, "99999986661652122824821048795547566080"}, + FromFloatTestParam{.real = 9.999999e+37f, + .precision = 38, + .scale = 0, + .expected_string = "99999986661652122824821048795547566080"}, }; for (const auto& param : params) { @@ -577,51 +612,123 @@ TEST(TestDecimalFromReal, FromDouble) { const std::vector params{ // -- Stress the 53 bits of precision of a double // 2**63 + 2**11 - FromDoubleTestParam{9.223372036854778e+18, 19, 0, "9223372036854777856"}, + FromDoubleTestParam{.real = 9.223372036854778e+18, + .precision = 19, + .scale = 0, + .expected_string = "9223372036854777856"}, // 2**64 - 2**11 - FromDoubleTestParam{1.844674407370955e+19, 20, 0, "18446744073709549568"}, + FromDoubleTestParam{.real = 1.844674407370955e+19, + .precision = 20, + .scale = 0, + .expected_string = "18446744073709549568"}, // 2**64 + 2**11 - FromDoubleTestParam{1.8446744073709556e+19, 20, 0, "18446744073709555712"}, + FromDoubleTestParam{.real = 1.8446744073709556e+19, + .precision = 20, + .scale = 0, + .expected_string = "18446744073709555712"}, // 2**126 - FromDoubleTestParam{8.507059173023462e+37, 38, 0, - "85070591730234615865843651857942052864"}, + FromDoubleTestParam{.real = 8.507059173023462e+37, + .precision = 38, + .scale = 0, + .expected_string = "85070591730234615865843651857942052864"}, // 2**126 - 2**74 - FromDoubleTestParam{8.50705917302346e+37, 38, 0, - "85070591730234596976377720379361198080"}, + FromDoubleTestParam{.real = 8.50705917302346e+37, + .precision = 38, + .scale = 0, + .expected_string = "85070591730234596976377720379361198080"}, // 2**36 - 2**-16 - FromDoubleTestParam{68719476735.999985, 11, 0, "68719476736"}, - FromDoubleTestParam{68719476735.999985, 38, 27, - "68719476735.999984741210937500000000000"}, + FromDoubleTestParam{.real = 68719476735.999985, + .precision = 11, + .scale = 0, + .expected_string = "68719476736"}, + FromDoubleTestParam{.real = 68719476735.999985, + .precision = 38, + .scale = 27, + .expected_string = "68719476735.999984741210937500000000000"}, // -- Other cases // Almost 10**38 (minus 2**73) - FromDoubleTestParam{9.999999999999998e+37, 38, 0, - "99999999999999978859343891977453174784"}, - FromDoubleTestParam{9.999999999999998e+27, 38, 10, - "9999999999999997384096481280.0000000000"}, + FromDoubleTestParam{.real = 9.999999999999998e+37, + .precision = 38, + .scale = 0, + .expected_string = "99999999999999978859343891977453174784"}, + FromDoubleTestParam{.real = 9.999999999999998e+27, + .precision = 38, + .scale = 10, + .expected_string = "9999999999999997384096481280.0000000000"}, // 10**N (sometimes fits in N digits) - FromDoubleTestParam{1e23, 23, 0, "99999999999999991611392"}, - FromDoubleTestParam{1e23, 24, 1, "99999999999999991611392.0"}, - FromDoubleTestParam{1e36, 37, 0, "1000000000000000042420637374017961984"}, - FromDoubleTestParam{1e36, 38, 1, "1000000000000000042420637374017961984.0"}, - FromDoubleTestParam{1e37, 37, 0, "9999999999999999538762658202121142272"}, - FromDoubleTestParam{1e37, 38, 1, "9999999999999999538762658202121142272.0"}, - FromDoubleTestParam{1e38, 38, 0, "99999999999999997748809823456034029568"}, + FromDoubleTestParam{.real = 1e23, + .precision = 23, + .scale = 0, + .expected_string = "99999999999999991611392"}, + FromDoubleTestParam{.real = 1e23, + .precision = 24, + .scale = 1, + .expected_string = "99999999999999991611392.0"}, + FromDoubleTestParam{.real = 1e36, + .precision = 37, + .scale = 0, + .expected_string = "1000000000000000042420637374017961984"}, + FromDoubleTestParam{.real = 1e36, + .precision = 38, + .scale = 1, + .expected_string = "1000000000000000042420637374017961984.0"}, + FromDoubleTestParam{.real = 1e37, + .precision = 37, + .scale = 0, + .expected_string = "9999999999999999538762658202121142272"}, + FromDoubleTestParam{.real = 1e37, + .precision = 38, + .scale = 1, + .expected_string = "9999999999999999538762658202121142272.0"}, + FromDoubleTestParam{.real = 1e38, + .precision = 38, + .scale = 0, + .expected_string = "99999999999999997748809823456034029568"}, // Hand-picked test cases that can involve precision issues. // More comprehensive testing is done in the PyArrow test suite. - FromDoubleTestParam{9.223372036854778e+10, 19, 8, "92233720368.54777527"}, - FromDoubleTestParam{1.8446744073709556e+15, 20, 4, "1844674407370955.5000"}, - FromDoubleTestParam{999999999999999.0, 16, 1, "999999999999999.0"}, - FromDoubleTestParam{9999999999999998.0, 17, 1, "9999999999999998.0"}, - FromDoubleTestParam{999999999999999.9, 16, 1, "999999999999999.9"}, - FromDoubleTestParam{9999999987., 38, 22, "9999999987.0000000000000000000000"}, - FromDoubleTestParam{9999999987., 38, 28, "9999999987.0000000000000000000000000000"}, + FromDoubleTestParam{.real = 9.223372036854778e+10, + .precision = 19, + .scale = 8, + .expected_string = "92233720368.54777527"}, + FromDoubleTestParam{.real = 1.8446744073709556e+15, + .precision = 20, + .scale = 4, + .expected_string = "1844674407370955.5000"}, + FromDoubleTestParam{.real = 999999999999999.0, + .precision = 16, + .scale = 1, + .expected_string = "999999999999999.0"}, + FromDoubleTestParam{.real = 9999999999999998.0, + .precision = 17, + .scale = 1, + .expected_string = "9999999999999998.0"}, + FromDoubleTestParam{.real = 999999999999999.9, + .precision = 16, + .scale = 1, + .expected_string = "999999999999999.9"}, + FromDoubleTestParam{.real = 9999999987., + .precision = 38, + .scale = 22, + .expected_string = "9999999987.0000000000000000000000"}, + FromDoubleTestParam{.real = 9999999987., + .precision = 38, + .scale = 28, + .expected_string = "9999999987.0000000000000000000000000000"}, // 1 - 2**-52 // XXX the result should be 0.99999999999999977795539507496869191527 // but our algorithm loses the right digit. - FromDoubleTestParam{0.9999999999999998, 38, 38, - "0.99999999999999977795539507496869191520"}, - FromDoubleTestParam{0.9999999999999998, 20, 20, "0.99999999999999977796"}, - FromDoubleTestParam{0.9999999999999998, 16, 16, "0.9999999999999998"}, + FromDoubleTestParam{.real = 0.9999999999999998, + .precision = 38, + .scale = 38, + .expected_string = "0.99999999999999977795539507496869191520"}, + FromDoubleTestParam{.real = 0.9999999999999998, + .precision = 20, + .scale = 20, + .expected_string = "0.99999999999999977796"}, + FromDoubleTestParam{.real = 0.9999999999999998, + .precision = 16, + .scale = 16, + .expected_string = "0.9999999999999998"}, }; for (const auto& param : params) { @@ -973,4 +1080,215 @@ TEST(DecimalTest, FromBigEndianInvalid) { IsError(ErrorKind::kInvalidArgument)); } +TEST(DecimalTestFunctionality, Multiply) { + ASSERT_EQ(Decimal(60501), Decimal(301) * Decimal(201)); + ASSERT_EQ(Decimal(-60501), Decimal(-301) * Decimal(201)); + ASSERT_EQ(Decimal(-60501), Decimal(301) * Decimal(-201)); + ASSERT_EQ(Decimal(60501), Decimal(-301) * Decimal(-201)); + + // Edge cases + for (auto x : std::vector{-INT64_MAX, -INT32_MAX, 0, INT32_MAX, INT64_MAX}) { + for (auto y : + std::vector{-INT32_MAX, -32, -2, -1, 0, 1, 2, 32, INT32_MAX}) { + Decimal decimal_x = DecimalFromInt128(x); + Decimal decimal_y = DecimalFromInt128(y); + Decimal result = decimal_x * decimal_y; + EXPECT_EQ(DecimalFromInt128(x * y), result) + << " x: " << decimal_x << " y: " << decimal_y; + } + } +} + +TEST(DecimalTestFunctionality, Divide) { + ASSERT_EQ(Decimal(66), Decimal(20100) / Decimal(301)); + ASSERT_EQ(Decimal(-66), Decimal(-20100) / Decimal(301)); + ASSERT_EQ(Decimal(-66), Decimal(20100) / Decimal(-301)); + ASSERT_EQ(Decimal(66), Decimal(-20100) / Decimal(-301)); + + for (auto x : std::vector{-INT64_MAX, -INT32_MAX, 0, INT32_MAX, INT64_MAX}) { + for (auto y : std::vector{-INT32_MAX, -32, -2, -1, 1, 2, 32, INT32_MAX}) { + Decimal decimal_x = DecimalFromInt128(x); + Decimal decimal_y = DecimalFromInt128(y); + Decimal result = decimal_x / decimal_y; + EXPECT_EQ(DecimalFromInt128(x / y), result) + << " x: " << decimal_x << " y: " << decimal_y; + } + } +} + +TEST(DecimalTestFunctionality, Modulo) { + ASSERT_EQ(Decimal(234), Decimal(20100) % Decimal(301)); + ASSERT_EQ(Decimal(-234), Decimal(-20100) % Decimal(301)); + ASSERT_EQ(Decimal(234), Decimal(20100) % Decimal(-301)); + ASSERT_EQ(Decimal(-234), Decimal(-20100) % Decimal(-301)); + + // Test some edge cases + for (auto x : std::vector{-INT64_MAX, -INT32_MAX, 0, INT32_MAX, INT64_MAX}) { + for (auto y : std::vector{-INT32_MAX, -32, -2, -1, 1, 2, 32, INT32_MAX}) { + Decimal decimal_x = DecimalFromInt128(x); + Decimal decimal_y = DecimalFromInt128(y); + Decimal result = decimal_x % decimal_y; + EXPECT_EQ(DecimalFromInt128(x % y), result) + << " x: " << decimal_x << " y: " << decimal_y; + } + } +} + +TEST(DecimalTestFunctionality, Sign) { + ASSERT_EQ(1, Decimal(999999).Sign()); + ASSERT_EQ(-1, Decimal(-999999).Sign()); + ASSERT_EQ(1, Decimal(0).Sign()); +} + +TEST(DecimalTestFunctionality, GetWholeAndFraction) { + Decimal value("123456"); + + auto check = [value](int32_t scale, std::pair expected) { + int32_t out; + auto result = value.GetWholeAndFraction(scale); + ASSERT_THAT(result->first.ToInteger(&out), IsOk()); + ASSERT_EQ(expected.first, out); + ASSERT_THAT(result->second.ToInteger(&out), IsOk()); + ASSERT_EQ(expected.second, out); + }; + + check(0, {123456, 0}); + check(1, {12345, 6}); + check(5, {1, 23456}); + check(7, {0, 123456}); +} + +TEST(DecimalTestFunctionality, GetWholeAndFractionNegative) { + Decimal value("-123456"); + + auto check = [value](int32_t scale, std::pair expected) { + int32_t out; + auto result = value.GetWholeAndFraction(scale); + ASSERT_THAT(result->first.ToInteger(&out), IsOk()); + ASSERT_EQ(expected.first, out); + ASSERT_THAT(result->second.ToInteger(&out), IsOk()); + ASSERT_EQ(expected.second, out); + }; + + check(0, {-123456, 0}); + check(1, {-12345, -6}); + check(5, {-1, -23456}); + check(7, {0, -123456}); +} + +TEST(DecimalTestFunctionality, FitsInPrecision) { + ASSERT_TRUE(Decimal("0").FitsInPrecision(1)); + ASSERT_TRUE(Decimal("9").FitsInPrecision(1)); + ASSERT_TRUE(Decimal("-9").FitsInPrecision(1)); + ASSERT_FALSE(Decimal("10").FitsInPrecision(1)); + ASSERT_FALSE(Decimal("-10").FitsInPrecision(1)); + + ASSERT_TRUE(Decimal("0").FitsInPrecision(2)); + ASSERT_TRUE(Decimal("10").FitsInPrecision(2)); + ASSERT_TRUE(Decimal("-10").FitsInPrecision(2)); + ASSERT_TRUE(Decimal("99").FitsInPrecision(2)); + ASSERT_TRUE(Decimal("-99").FitsInPrecision(2)); + ASSERT_FALSE(Decimal("100").FitsInPrecision(2)); + ASSERT_FALSE(Decimal("-100").FitsInPrecision(2)); + + std::string max_nines(Decimal::kMaxPrecision, '9'); + ASSERT_TRUE(Decimal(max_nines).FitsInPrecision(Decimal::kMaxPrecision)); + ASSERT_TRUE(Decimal("-" + max_nines).FitsInPrecision(Decimal::kMaxPrecision)); + + std::string max_zeros(Decimal::kMaxPrecision, '0'); + ASSERT_FALSE(Decimal("1" + max_zeros).FitsInPrecision(Decimal::kMaxPrecision)); + ASSERT_FALSE(Decimal("-1" + max_zeros).FitsInPrecision(Decimal::kMaxPrecision)); +} + +TEST(DecimalTest, LeftShift) { + auto check = [](int128_t x, uint32_t bits) { + auto expected = DecimalFromInt128(x << bits); + auto actual = DecimalFromInt128(x) << bits; + ASSERT_EQ(actual.low(), expected.low()); + ASSERT_EQ(actual.high(), expected.high()); + }; + + ASSERT_EQ(Decimal("0"), Decimal("0") << 0); + ASSERT_EQ(Decimal("0"), Decimal("0") << 1); + ASSERT_EQ(Decimal("0"), Decimal("0") << 63); + ASSERT_EQ(Decimal("0"), Decimal("0") << 127); + + check(123, 0); + check(123, 1); + check(123, 63); + check(123, 64); + check(123, 120); + + ASSERT_EQ(Decimal("199999999999998"), Decimal("99999999999999") << 1); + ASSERT_EQ(Decimal("3435973836799965640261632"), Decimal("99999999999999") << 35); + ASSERT_EQ(Decimal("120892581961461708544797985370825293824"), Decimal("99999999999999") + << 80); + + ASSERT_EQ(Decimal("1234567890123456789012"), Decimal("1234567890123456789012") << 0); + ASSERT_EQ(Decimal("2469135780246913578024"), Decimal("1234567890123456789012") << 1); + ASSERT_EQ(Decimal("88959991838777271103427858320412639232"), + Decimal("1234567890123456789012") << 56); + + check(-123, 0); + check(-123, 1); + check(-123, 63); + check(-123, 64); + check(-123, 120); + + ASSERT_EQ(Decimal("-199999999999998"), Decimal("-99999999999999") << 1); + ASSERT_EQ(Decimal("-3435973836799965640261632"), Decimal("-99999999999999") << 35); + ASSERT_EQ(Decimal("-120892581961461708544797985370825293824"), + Decimal("-99999999999999") << 80); + + ASSERT_EQ(Decimal("-1234567890123456789012"), Decimal("-1234567890123456789012") << 0); + ASSERT_EQ(Decimal("-2469135780246913578024"), Decimal("-1234567890123456789012") << 1); + ASSERT_EQ(Decimal("-88959991838777271103427858320412639232"), + Decimal("-1234567890123456789012") << 56); +} + +TEST(DecimalTest, RightShift) { + ASSERT_EQ(Decimal("0"), Decimal("0") >> 0); + ASSERT_EQ(Decimal("0"), Decimal("0") >> 1); + ASSERT_EQ(Decimal("0"), Decimal("0") >> 63); + ASSERT_EQ(Decimal("0"), Decimal("0") >> 127); + + ASSERT_EQ(Decimal("1"), Decimal("1") >> 0); + ASSERT_EQ(Decimal("0"), Decimal("1") >> 1); + ASSERT_EQ(Decimal("0"), Decimal("1") >> 63); + ASSERT_EQ(Decimal("0"), Decimal("1") >> 127); + + ASSERT_EQ(Decimal("-1"), Decimal("-1") >> 0); + ASSERT_EQ(Decimal("-1"), Decimal("-1") >> 1); + ASSERT_EQ(Decimal("-1"), Decimal("-1") >> 63); + ASSERT_EQ(Decimal("-1"), Decimal("-1") >> 127); + + ASSERT_EQ(Decimal("1096516"), Decimal("1234567890123456789012") >> 50); + ASSERT_EQ(Decimal("66"), Decimal("1234567890123456789012") >> 64); + ASSERT_EQ(Decimal("2"), Decimal("1234567890123456789012") >> 69); + ASSERT_EQ(Decimal("0"), Decimal("1234567890123456789012") >> 71); + ASSERT_EQ(Decimal("0"), Decimal("1234567890123456789012") >> 127); + + ASSERT_EQ(Decimal("-1096517"), Decimal("-1234567890123456789012") >> 50); + ASSERT_EQ(Decimal("-67"), Decimal("-1234567890123456789012") >> 64); + ASSERT_EQ(Decimal("-3"), Decimal("-1234567890123456789012") >> 69); + ASSERT_EQ(Decimal("-1"), Decimal("-1234567890123456789012") >> 71); + ASSERT_EQ(Decimal("-1"), Decimal("-1234567890123456789012") >> 127); +} + +TEST(DecimalTest, Negate) { + auto check = [](Decimal pos, Decimal neg) { + EXPECT_EQ(-pos, neg); + EXPECT_EQ(-neg, pos); + }; + + check(Decimal(0, 0), Decimal(0, 0)); + check(Decimal(0, 1), Decimal(-1, 0xFFFFFFFFFFFFFFFFULL)); + check(Decimal(0, 2), Decimal(-1, 0xFFFFFFFFFFFFFFFEULL)); + check(Decimal(0, 0x8000000000000000ULL), Decimal(-1, 0x8000000000000000ULL)); + check(Decimal(0, 0xFFFFFFFFFFFFFFFFULL), Decimal(-1, 1)); + check(Decimal(12, 0), Decimal(-12, 0)); + check(Decimal(12, 1), Decimal(-13, 0xFFFFFFFFFFFFFFFFULL)); + check(Decimal(12, 0xFFFFFFFFFFFFFFFFULL), Decimal(-13, 1)); +} + } // namespace iceberg From 00dac135201a9baa17ae7eb00341ca5fb58ca94f Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Fri, 22 Aug 2025 21:41:13 +0800 Subject: [PATCH 09/20] feat: add rescale --- src/iceberg/expression/decimal.cc | 31 ++++++++++++++++++++++++++++++- test/decimal_test.cc | 14 ++++++++++++-- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/src/iceberg/expression/decimal.cc b/src/iceberg/expression/decimal.cc index 3cef87f32..c658d2f69 100644 --- a/src/iceberg/expression/decimal.cc +++ b/src/iceberg/expression/decimal.cc @@ -1234,6 +1234,19 @@ static inline uint64_t UInt64FromBigEndian(const uint8_t* bytes, int32_t length) #endif } +static bool RescaleWouldCauseDataLoss(const Decimal& value, int32_t delta_scale, + const Decimal& multiplier, Decimal* result) { + if (delta_scale < 0) { + auto res = value.Divide(multiplier); + assert(res); + *result = res->first; + return res->second != 0; + } + + *result = value * multiplier; + return (value < 0) ? *result > value : *result < value; +} + } // namespace Result Decimal::FromReal(float x, int32_t precision, int32_t scale) { @@ -1322,7 +1335,23 @@ Result Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const { return *this; } - return NotImplemented("Decimal::Rescale is not implemented yet"); + const int32_t delta_scale = new_scale - orig_scale; + const int32_t abs_delta_scale = std::abs(delta_scale); + Decimal out; + + assert(abs_delta_scale <= kMaxScale); + + auto& multiplier = kDecimal128PowersOfTen[abs_delta_scale]; + + const bool rescale_would_cause_data_loss = + RescaleWouldCauseDataLoss(*this, delta_scale, multiplier, &out); + + if (rescale_would_cause_data_loss) { + return Invalid("Rescale would cause data loss: {} -> {}", ToIntegerString(), + out.ToIntegerString()); + } + + return out; } bool Decimal::FitsInPrecision(int32_t precision) const { diff --git a/test/decimal_test.cc b/test/decimal_test.cc index 61119d7b7..7e9d9ae3b 100644 --- a/test/decimal_test.cc +++ b/test/decimal_test.cc @@ -28,7 +28,6 @@ #include #include -#include "gmock/gmock.h" #include "iceberg/util/port.h" #include "matchers.h" @@ -558,7 +557,9 @@ TEST(TestDecimalFromReal, FromFloat) { // 2**14 - 2**-10 FromFloatTestParam{ .real = 16383.999f, .precision = 8, .scale = 3, .expected_string = "16383.999"}, - FromFloatTestParam{16383.999f, .precision = 19, .scale = 3, + FromFloatTestParam{.real = 16383.999f, + .precision = 19, + .scale = 3, .expected_string = "16383.999"}, // 1 - 2**-24 FromFloatTestParam{.real = 0.99999994f, @@ -1291,4 +1292,13 @@ TEST(DecimalTest, Negate) { check(Decimal(12, 0xFFFFFFFFFFFFFFFFULL), Decimal(-13, 1)); } +TEST(DecimalTest, Rescale) { + ASSERT_EQ(Decimal(11100), Decimal(111).Rescale(0, 2).value()); + ASSERT_EQ(Decimal(111), Decimal(11100).Rescale(2, 0).value()); + ASSERT_EQ(Decimal(5), Decimal(500000).Rescale(6, 1).value()); + ASSERT_EQ(Decimal(500000), Decimal(5).Rescale(1, 6).value()); + + ASSERT_THAT(Decimal(5555555).Rescale(6, 1), IsError(ErrorKind::kInvalid)); +} + } // namespace iceberg From 74742e74fc9693504686975dd2cd575ae9812b71 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Fri, 22 Aug 2025 22:55:13 +0800 Subject: [PATCH 10/20] fix: windows compiling error --- src/iceberg/expression/decimal.cc | 2 +- src/iceberg/expression/decimal.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/iceberg/expression/decimal.cc b/src/iceberg/expression/decimal.cc index c658d2f69..ecf72640d 100644 --- a/src/iceberg/expression/decimal.cc +++ b/src/iceberg/expression/decimal.cc @@ -1373,7 +1373,7 @@ std::array Decimal::ToBytes() const { return out; } -ICEBERG_EXPORT std::ostream& operator<<(std::ostream& os, const Decimal& decimal) { +std::ostream& operator<<(std::ostream& os, const Decimal& decimal) { os << decimal.ToIntegerString(); return os; } diff --git a/src/iceberg/expression/decimal.h b/src/iceberg/expression/decimal.h index 0d497fbfe..80d26f33f 100644 --- a/src/iceberg/expression/decimal.h +++ b/src/iceberg/expression/decimal.h @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -235,8 +236,6 @@ class ICEBERG_EXPORT Decimal { return lhs.data_ != rhs.data_; } - friend std::ostream& operator<<(std::ostream& os, const Decimal& decimal); - private: #if ICEBERG_LITTLE_ENDIAN static constexpr int32_t kHighIndex = 1; @@ -249,6 +248,8 @@ class ICEBERG_EXPORT Decimal { std::array data_; }; +ICEBERG_EXPORT std::ostream& operator<<(std::ostream& os, const Decimal& decimal); + // Unary operators ICEBERG_EXPORT Decimal operator-(const Decimal& operand); ICEBERG_EXPORT Decimal operator~(const Decimal& operand); From feff5bdf1f737399f23c4d5298e248d8944cf36f Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Fri, 22 Aug 2025 23:43:02 +0800 Subject: [PATCH 11/20] chore: add license --- LICENSE | 14 ++++++++++++++ src/iceberg/expression/decimal.cc | 5 +++++ src/iceberg/expression/decimal.h | 5 +++++ 3 files changed, 24 insertions(+) diff --git a/LICENSE b/LICENSE index f97d28dfe..e57409056 100644 --- a/LICENSE +++ b/LICENSE @@ -253,3 +253,17 @@ https://github.com/apache/arrow/blob/main/cpp/src/arrow/visit_type_inline.h Copyright: 2016-2025 The Apache Software Foundation. Home page: https://arrow.apache.org/ License: https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +The file src/iceberg/expression/decimal.h contains code adapted from + +https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.h + +The file src/iceberg/expression/decimal.cc contains code adapted from + +https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.cc + +Copyright: 2016-2025 The Apache Software Foundation. +Home page: https://arrow.apache.org/ +License: https://www.apache.org/licenses/LICENSE-2.0 diff --git a/src/iceberg/expression/decimal.cc b/src/iceberg/expression/decimal.cc index ecf72640d..0061a411a 100644 --- a/src/iceberg/expression/decimal.cc +++ b/src/iceberg/expression/decimal.cc @@ -17,6 +17,11 @@ * under the License. */ +/// \file iceberg/expression/decimal.cc +/// \brief 128-bit fixed-point decimal numbers. +/// Adapted from Apache Arrow with only Decimal128 support. +/// https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.cc + #include "iceberg/expression/decimal.h" #include diff --git a/src/iceberg/expression/decimal.h b/src/iceberg/expression/decimal.h index 80d26f33f..3b1587e82 100644 --- a/src/iceberg/expression/decimal.h +++ b/src/iceberg/expression/decimal.h @@ -19,6 +19,11 @@ #pragma once +/// \file iceberg/expression/decimal.h +/// \brief 128-bit fixed-point decimal numbers. +/// Adapted from Apache Arrow with only Decimal128 support. +/// https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.h + #include #include #include From 026dfc94405c46a85f2cfbc2b10148af08a9ef95 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 27 Aug 2025 21:24:51 +0800 Subject: [PATCH 12/20] fix: review comments --- LICENSE | 4 +- src/iceberg/CMakeLists.txt | 3 +- src/iceberg/{expression => util}/decimal.cc | 34 +++------- src/iceberg/{expression => util}/decimal.h | 72 ++++++++++++--------- src/iceberg/util/{port.h => int128.h} | 15 +---- test/decimal_test.cc | 29 +++++---- 6 files changed, 74 insertions(+), 83 deletions(-) rename src/iceberg/{expression => util}/decimal.cc (98%) rename src/iceberg/{expression => util}/decimal.h (87%) rename src/iceberg/util/{port.h => int128.h} (68%) diff --git a/LICENSE b/LICENSE index e57409056..b7f4c880b 100644 --- a/LICENSE +++ b/LICENSE @@ -256,11 +256,11 @@ License: https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -The file src/iceberg/expression/decimal.h contains code adapted from +The file src/iceberg/util/decimal.h contains code adapted from https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.h -The file src/iceberg/expression/decimal.cc contains code adapted from +The file src/iceberg/util/decimal.cc contains code adapted from https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.cc diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 882c5a681..18ec6c60e 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -19,7 +19,7 @@ set(ICEBERG_INCLUDES "$" "$") set(ICEBERG_SOURCES catalog/in_memory_catalog.cc - expression/decimal.cc + util/decimal.cc expression/expression.cc expression/literal.cc file_reader.cc @@ -50,6 +50,7 @@ set(ICEBERG_SOURCES manifest_reader_internal.cc manifest_writer.cc arrow_c_data_guard_internal.cc + util/decimal.cc util/murmurhash3_internal.cc util/timepoint.cc util/gzip_internal.cc) diff --git a/src/iceberg/expression/decimal.cc b/src/iceberg/util/decimal.cc similarity index 98% rename from src/iceberg/expression/decimal.cc rename to src/iceberg/util/decimal.cc index 0061a411a..b22db9166 100644 --- a/src/iceberg/expression/decimal.cc +++ b/src/iceberg/util/decimal.cc @@ -17,12 +17,12 @@ * under the License. */ -/// \file iceberg/expression/decimal.cc +/// \file iceberg/util/decimal.cc /// \brief 128-bit fixed-point decimal numbers. /// Adapted from Apache Arrow with only Decimal128 support. /// https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.cc -#include "iceberg/expression/decimal.h" +#include "iceberg/util/decimal.h" #include #include @@ -44,6 +44,7 @@ #include #include "iceberg/result.h" +#include "iceberg/util/int128.h" #include "iceberg/util/macros.h" namespace iceberg { @@ -314,7 +315,7 @@ static inline Status DecimalDivide(const Decimal& dividend, const Decimal& divis uint32_t prev = dividend_array[i]; dividend_array[i] -= static_cast(mult); - // if guess was too big, we add back divisor + // if guess was too big, we add back divisor if (dividend_array[i] > prev) { guess--; uint32_t carry = 0; @@ -1232,11 +1233,12 @@ static inline uint64_t UInt64FromBigEndian(const uint8_t* bytes, int32_t length) // and doing the conversion in 16, 32 parts, which could // possibly create unaligned memory access on certain platforms memcpy(reinterpret_cast(&result) + 8 - length, bytes, length); -#if ICEBERG_LITTLE_ENDIAN - return std::byteswap(result); -#else - return result; -#endif + + if constexpr (std::endian::native == std::endian::little) { + return std::byteswap(result); + } else { + return result; + } } static bool RescaleWouldCauseDataLoss(const Decimal& value, int32_t delta_scale, @@ -1420,20 +1422,4 @@ ICEBERG_EXPORT Decimal operator%(const Decimal& lhs, const Decimal& rhs) { return lhs.Divide(rhs).value().second; } -ICEBERG_EXPORT bool operator<(const Decimal& lhs, const Decimal& rhs) { - return (lhs.high() < rhs.high()) || (lhs.high() == rhs.high() && lhs.low() < rhs.low()); -} - -ICEBERG_EXPORT bool operator<=(const Decimal& lhs, const Decimal& rhs) { - return !operator>(lhs, rhs); -} - -ICEBERG_EXPORT bool operator>(const Decimal& lhs, const Decimal& rhs) { - return operator<(rhs, lhs); -} - -ICEBERG_EXPORT bool operator>=(const Decimal& lhs, const Decimal& rhs) { - return !operator<(lhs, rhs); -} - } // namespace iceberg diff --git a/src/iceberg/expression/decimal.h b/src/iceberg/util/decimal.h similarity index 87% rename from src/iceberg/expression/decimal.h rename to src/iceberg/util/decimal.h index 3b1587e82..3b42d3a13 100644 --- a/src/iceberg/expression/decimal.h +++ b/src/iceberg/util/decimal.h @@ -19,22 +19,24 @@ #pragma once -/// \file iceberg/expression/decimal.h +/// \file iceberg/util/decimal.h /// \brief 128-bit fixed-point decimal numbers. /// Adapted from Apache Arrow with only Decimal128 support. /// https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.h #include +#include #include #include #include #include #include +#include + #include "iceberg/iceberg_export.h" #include "iceberg/result.h" #include "iceberg/util/macros.h" -#include "iceberg/util/port.h" namespace iceberg { @@ -60,24 +62,24 @@ class ICEBERG_EXPORT Decimal { constexpr Decimal(T value) noexcept // NOLINT { if (value < T{}) { - data_[kHighIndex] = ~static_cast(0); + data_[highIndex()] = ~static_cast(0); } else { - data_[kHighIndex] = 0; + data_[highIndex()] = 0; } - data_[kLowIndex] = static_cast(value); + data_[lowIndex()] = static_cast(value); } /// \brief Parse a Decimal from a string representation. explicit Decimal(std::string_view str); -/// \brief Create a Decimal from two 64-bit integers. -#if ICEBERG_LITTLE_ENDIAN - constexpr Decimal(int64_t high, uint64_t low) noexcept - : data_{low, static_cast(high)} {} -#else - constexpr Decimal(int64_t high, uint64_t low) noexcept - : data_{static_cast(high), low} {} -#endif + /// \brief Create a Decimal from two 64-bit integers. + constexpr Decimal(int64_t high, uint64_t low) noexcept { + if constexpr (std::endian::native == std::endian::little) { + data_ = {low, static_cast(high)}; + } else { + data_ = {static_cast(high), low}; + } + } /// \brief Negate the current Decimal value (in place) Decimal& Negate(); @@ -139,10 +141,10 @@ class ICEBERG_EXPORT Decimal { } /// \brief Get the high bits of the two's complement representation of the number. - constexpr int64_t high() const { return static_cast(data_[kHighIndex]); } + constexpr int64_t high() const { return static_cast(data_[highIndex()]); } /// \brief Get the low bits of the two's complement representation of the number. - constexpr uint64_t low() const { return data_[kLowIndex]; } + constexpr uint64_t low() const { return data_[lowIndex()]; } /// \brief Convert the Decimal value to a base 10 decimal string with the given scale. /// \param scale The scale to use for the string representation. @@ -178,6 +180,14 @@ class ICEBERG_EXPORT Decimal { /// Returns true if the number of significant digits is less or equal to `precision`. bool FitsInPrecision(int32_t precision) const; + /// \brief Spaceship operator for three-way comparison. + std::strong_ordering operator<=>(const Decimal& other) const { + if (high() != other.high()) { + return high() <=> other.high(); + } + return low() <=> other.low(); + } + /// \brief Convert to a signed integer template requires std::is_same_v || std::is_same_v @@ -226,10 +236,10 @@ class ICEBERG_EXPORT Decimal { std::array ToBytes() const; /// \brief Returns 1 if positive or zero, -1 if strictly negative. - int64_t Sign() const { return 1 | (static_cast(data_[kHighIndex]) >> 63); } + int64_t Sign() const { return 1 | (static_cast(data_[highIndex()]) >> 63); } /// \brief Check if the Decimal value is negative. - bool IsNegative() const { return static_cast(data_[kHighIndex]) < 0; } + bool IsNegative() const { return static_cast(data_[highIndex()]) < 0; } explicit operator bool() const { return data_ != std::array{0, 0}; } @@ -242,13 +252,21 @@ class ICEBERG_EXPORT Decimal { } private: -#if ICEBERG_LITTLE_ENDIAN - static constexpr int32_t kHighIndex = 1; - static constexpr int32_t kLowIndex = 0; -#else - static constexpr int32_t kHighIndex = 0; - static constexpr int32_t kLowIndex = 1; -#endif + static constexpr int32_t highIndex() { + if constexpr (std::endian::native == std::endian::little) { + return 1; + } else { + return 0; + } + } + + static constexpr int32_t lowIndex() { + if constexpr (std::endian::native == std::endian::little) { + return 0; + } else { + return 1; + } + } std::array data_; }; @@ -259,12 +277,6 @@ ICEBERG_EXPORT std::ostream& operator<<(std::ostream& os, const Decimal& decimal ICEBERG_EXPORT Decimal operator-(const Decimal& operand); ICEBERG_EXPORT Decimal operator~(const Decimal& operand); -// Binary operators -ICEBERG_EXPORT bool operator<=(const Decimal& lhs, const Decimal& rhs); -ICEBERG_EXPORT bool operator<(const Decimal& lhs, const Decimal& rhs); -ICEBERG_EXPORT bool operator>=(const Decimal& lhs, const Decimal& rhs); -ICEBERG_EXPORT bool operator>(const Decimal& lhs, const Decimal& rhs); - ICEBERG_EXPORT Decimal operator+(const Decimal& lhs, const Decimal& rhs); ICEBERG_EXPORT Decimal operator-(const Decimal& lhs, const Decimal& rhs); ICEBERG_EXPORT Decimal operator*(const Decimal& lhs, const Decimal& rhs); diff --git a/src/iceberg/util/port.h b/src/iceberg/util/int128.h similarity index 68% rename from src/iceberg/util/port.h rename to src/iceberg/util/int128.h index bdd65520b..8e9d27dc7 100644 --- a/src/iceberg/util/port.h +++ b/src/iceberg/util/int128.h @@ -17,22 +17,11 @@ * under the License. */ -/// \file iceberg/util/port.h -/// \brief Portability macros and definitions for Iceberg C++ library +/// \file iceberg/util/int128.h +/// \brief 128-bit integer type #pragma once -#if defined(_WIN32) /* Windows is always little endian */ \ - || defined(__LITTLE_ENDIAN__) || \ - (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -# define ICEBERG_LITTLE_ENDIAN 1 -#elif defined(__BIG_ENDIAN__) || \ - (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) -# define ICEBERG_LITTLE_ENDIAN 0 -#else -# error "Unsupported or unknown endianness" -#endif - #if defined(_MSC_VER) # include <__msvc_int128.hpp> using int128_t = std::_Signed128; diff --git a/test/decimal_test.cc b/test/decimal_test.cc index 7e9d9ae3b..1ba633bf0 100644 --- a/test/decimal_test.cc +++ b/test/decimal_test.cc @@ -16,10 +16,11 @@ * specific language governing permissions and limitations * under the License. */ -#include "iceberg/expression/decimal.h" +#include "iceberg/util/decimal.h" #include #include +#include #include #include #include @@ -28,7 +29,7 @@ #include #include -#include "iceberg/util/port.h" +#include "iceberg/util/int128.h" #include "matchers.h" namespace iceberg { @@ -1030,9 +1031,10 @@ TEST(DecimalTest, FromBigEndian) { Decimal value(start); for (int ii = 0; ii < Decimal::kByteWidth; ++ii) { auto native_endian = value.ToBytes(); -#if ICEBERG_LITTLE_ENDIAN - std::ranges::reverse(native_endian); -#endif + if constexpr (std::endian::native == std::endian::little) { + // convert to big endian + std::ranges::reverse(native_endian); + } // Limit the number of bytes we are passing to make // sure that it works correctly. That's why all of the // 'start' values don't have a 1 in the most significant @@ -1047,10 +1049,11 @@ TEST(DecimalTest, FromBigEndian) { auto negated = -value; native_endian = negated.ToBytes(); -#if ICEBERG_LITTLE_ENDIAN - // convert to big endian - std::ranges::reverse(native_endian); -#endif + if constexpr (std::endian::native == std::endian::little) { + // convert to big endian + std::ranges::reverse(native_endian); + } + result = Decimal::FromBigEndian(native_endian.data() + WidthMinusOne - ii, ii + 1); ASSERT_THAT(result, IsOk()); const Decimal& negated_decimal = result.value(); @@ -1060,10 +1063,10 @@ TEST(DecimalTest, FromBigEndian) { auto complement = ~value; native_endian = complement.ToBytes(); -#if ICEBERG_LITTLE_ENDIAN - // convert to big endian - std::ranges::reverse(native_endian); -#endif + if constexpr (std::endian::native == std::endian::little) { + // convert to big endian + std::ranges::reverse(native_endian); + } result = Decimal::FromBigEndian(native_endian.data(), Decimal::kByteWidth); ASSERT_THAT(result, IsOk()); const Decimal& complement_decimal = result.value(); From c272c55b8ae0903461885769dd2f2c74dac1a2e3 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 27 Aug 2025 21:45:31 +0800 Subject: [PATCH 13/20] fix: use int128_t as Decimal data --- src/iceberg/util/decimal.cc | 8 +++----- src/iceberg/util/decimal.h | 33 +++++++++++---------------------- 2 files changed, 14 insertions(+), 27 deletions(-) diff --git a/src/iceberg/util/decimal.cc b/src/iceberg/util/decimal.cc index b22db9166..89d26635e 100644 --- a/src/iceberg/util/decimal.cc +++ b/src/iceberg/util/decimal.cc @@ -419,14 +419,12 @@ Decimal& Decimal::operator/=(const Decimal& other) { } Decimal& Decimal::operator|=(const Decimal& other) { - data_[0] |= other.data_[0]; - data_[1] |= other.data_[1]; + data_ |= other.data_; return *this; } Decimal& Decimal::operator&=(const Decimal& other) { - data_[0] &= other.data_[0]; - data_[1] &= other.data_[1]; + data_ &= other.data_; return *this; } @@ -1376,7 +1374,7 @@ double Decimal::ToDouble(int32_t scale) const { std::array Decimal::ToBytes() const { std::array out{{0}}; - memcpy(out.data(), data_.data(), kByteWidth); + memcpy(out.data(), &data_, kByteWidth); return out; } diff --git a/src/iceberg/util/decimal.h b/src/iceberg/util/decimal.h index 3b42d3a13..322f706ea 100644 --- a/src/iceberg/util/decimal.h +++ b/src/iceberg/util/decimal.h @@ -36,6 +36,7 @@ #include "iceberg/iceberg_export.h" #include "iceberg/result.h" +#include "iceberg/util/int128.h" #include "iceberg/util/macros.h" namespace iceberg { @@ -51,22 +52,14 @@ class ICEBERG_EXPORT Decimal { static constexpr int32_t kMaxScale = 38; /// \brief Default constructor initializes to zero. - constexpr Decimal() noexcept : data_{0, 0} {} - - /// \brief Construct a Decimal from an array of 2 64-bit integers. - explicit Decimal(std::array data) noexcept : data_(std::move(data)) {} + constexpr Decimal() noexcept = default; /// \brief Create a Decimal from any integer not wider than 64 bits. template requires(std::is_integral_v && (sizeof(T) <= sizeof(uint64_t))) constexpr Decimal(T value) noexcept // NOLINT { - if (value < T{}) { - data_[highIndex()] = ~static_cast(0); - } else { - data_[highIndex()] = 0; - } - data_[lowIndex()] = static_cast(value); + data_ = static_cast(value); } /// \brief Parse a Decimal from a string representation. @@ -74,11 +67,7 @@ class ICEBERG_EXPORT Decimal { /// \brief Create a Decimal from two 64-bit integers. constexpr Decimal(int64_t high, uint64_t low) noexcept { - if constexpr (std::endian::native == std::endian::little) { - data_ = {low, static_cast(high)}; - } else { - data_ = {static_cast(high), low}; - } + data_ = (static_cast(high) << 64) | low; } /// \brief Negate the current Decimal value (in place) @@ -141,10 +130,10 @@ class ICEBERG_EXPORT Decimal { } /// \brief Get the high bits of the two's complement representation of the number. - constexpr int64_t high() const { return static_cast(data_[highIndex()]); } + constexpr int64_t high() const { return static_cast(data_ >> 64); } /// \brief Get the low bits of the two's complement representation of the number. - constexpr uint64_t low() const { return data_[lowIndex()]; } + constexpr uint64_t low() const { return static_cast(data_); } /// \brief Convert the Decimal value to a base 10 decimal string with the given scale. /// \param scale The scale to use for the string representation. @@ -229,19 +218,19 @@ class ICEBERG_EXPORT Decimal { } const uint8_t* native_endian_bytes() const { - return reinterpret_cast(data_.data()); + return reinterpret_cast(data_); } /// \brief Returns the raw bytes of the value in native-endian byte order. std::array ToBytes() const; /// \brief Returns 1 if positive or zero, -1 if strictly negative. - int64_t Sign() const { return 1 | (static_cast(data_[highIndex()]) >> 63); } + int64_t Sign() const { return 1 | (high() >> 63); } /// \brief Check if the Decimal value is negative. - bool IsNegative() const { return static_cast(data_[highIndex()]) < 0; } + bool IsNegative() const { return (high() >> 63) < 0; } - explicit operator bool() const { return data_ != std::array{0, 0}; } + explicit operator bool() const { return data_ != 0; } friend bool operator==(const Decimal& lhs, const Decimal& rhs) { return lhs.data_ == rhs.data_; @@ -268,7 +257,7 @@ class ICEBERG_EXPORT Decimal { } } - std::array data_; + int128_t data_{0}; }; ICEBERG_EXPORT std::ostream& operator<<(std::ostream& os, const Decimal& decimal); From 2d5096acce248a12f960a572943c022852375f07 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 27 Aug 2025 22:15:12 +0800 Subject: [PATCH 14/20] chore: use int128_t calculation --- src/iceberg/util/decimal.cc | 652 ++++++++---------------------------- src/iceberg/util/decimal.h | 26 +- 2 files changed, 159 insertions(+), 519 deletions(-) diff --git a/src/iceberg/util/decimal.cc b/src/iceberg/util/decimal.cc index 89d26635e..80fad5cc5 100644 --- a/src/iceberg/util/decimal.cc +++ b/src/iceberg/util/decimal.cc @@ -42,6 +42,7 @@ #include #include #include +#include #include "iceberg/result.h" #include "iceberg/util/int128.h" @@ -51,61 +52,6 @@ namespace iceberg { namespace { -static constexpr uint64_t kInt64Mask = 0xFFFFFFFFFFFFFFFF; - -// Convenience wrapper type over 128 bit unsigned integers. This class merely provides the -// minimum necessary set of functions to perform 128 bit multiplication operations. -struct Uint128 { - Uint128() = default; - Uint128(uint64_t high, uint64_t low) - : val_((static_cast(high) << 64) | low) {} - - explicit Uint128(uint64_t value) : val_(value) {} - explicit Uint128(const Decimal& decimal) - : val_((static_cast(decimal.high()) << 64) | decimal.low()) {} - - uint64_t high() const { return static_cast(val_ >> 64); } - uint64_t low() const { return static_cast(val_ & kInt64Mask); } - - Uint128& operator+=(const Uint128& other) { - val_ += other.val_; - return *this; - } - - Uint128& operator*=(const Uint128& other) { - val_ *= other.val_; - return *this; - } - - uint128_t val_{}; -}; - -// Signed addition with well-defined behaviour on overflow (as unsigned) -template - requires std::is_signed_v -constexpr SignedInt SafeSignedAdd(SignedInt u, SignedInt v) { - using UnsignedInt = std::make_unsigned_t; - return static_cast(static_cast(u) + - static_cast(v)); -} - -// Signed subtraction with well-defined behaviour on overflow (as unsigned) -template - requires std::is_signed_v -constexpr SignedInt SafeSignedSubtract(SignedInt u, SignedInt v) { - using UnsignedInt = std::make_unsigned_t; - return static_cast(static_cast(u) - - static_cast(v)); -} - -// Signed negation with well-defined behaviour on overflow (as unsigned) -template - requires std::is_signed_v -constexpr SignedInt SafeSignedNegate(SignedInt u) { - using UnsignedInt = std::make_unsigned_t; - return static_cast(~static_cast(u) + 1); -} - // Signed left shift with well-defined behaviour on negative numbers or overflow template requires std::is_signed_v && std::is_integral_v @@ -114,364 +60,6 @@ constexpr SignedInt SafeLeftShift(SignedInt u, Shift shift) { return static_cast(static_cast(u) << shift); } -/// \brief Expands the given value into a big endian array of ints so that we can work on -/// it. The array will be converted to an absolute value. The array will remove leading -/// zeros from the value. -/// \param array a big endian array of length 4 to set with the value -/// \result the output length of the array -static inline int64_t FillInArray(const Decimal& value, uint32_t* array) { - Decimal abs_value = Decimal::Abs(value); - auto high = static_cast(abs_value.high()); - auto low = abs_value.low(); - - if (high != 0) { - if (high > std::numeric_limits::max()) { - array[0] = static_cast(high >> 32); - array[1] = static_cast(high); - array[2] = static_cast(low >> 32); - array[3] = static_cast(low); - return 4; - } - - array[0] = static_cast(high); - array[1] = static_cast(low >> 32); - array[2] = static_cast(low); - return 3; - } - - if (low > std::numeric_limits::max()) { - array[0] = static_cast(low >> 32); - array[1] = static_cast(low); - return 2; - } - - if (low == 0) { - return 0; - } - - array[0] = static_cast(low); - return 1; -} - -/// \brief Shift the number in the array left by bits positions. -static inline void ShiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) { - if (length > 0 && bits > 0) { - for (int32_t i = 0; i < length - 1; ++i) { - array[i] = (array[i] << bits) | (array[i + 1] >> (32 - bits)); - } - array[length - 1] <<= bits; - } -} - -/// \brief Shift the number in the array right by bits positions. -static inline void ShiftArrayRight(uint32_t* array, int64_t length, int64_t bits) { - if (length > 0 && bits > 0) { - for (int32_t i = length - 1; i > 0; --i) { - array[i] = (array[i] >> bits) | (array[i - 1] << (32 - bits)); - } - array[0] >>= bits; - } -} - -/// \brief Fix the signs of the result and remainder after a division operation. -static inline void FixDivisionSigns(Decimal* result, Decimal* remainder, - bool dividend_negative, bool divisor_negative) { - if (dividend_negative != divisor_negative) { - result->Negate(); - } - if (dividend_negative) { - remainder->Negate(); - } -} - -/// \brief Build a Decimal from a big-endian array of uint32_t. -static Status BuildFromArray(Decimal* result, const uint32_t* array, int64_t length) { - // result_array[0] is low, result_array[1] is high - std::array result_array = {0, 0}; - for (int64_t i = length - 2 * 2 - 1; i >= 0; i--) { - if (array[i] != 0) { - return Invalid("Decimal division overflow"); - } - } - - // Construct the array in reverse order - int64_t next_index = length - 1; - for (size_t i = 0; i < 2 && next_index >= 0; i++) { - uint64_t lower_bits = array[next_index--]; - result_array[i] = - (next_index < 0) - ? lower_bits - : (static_cast(array[next_index--]) << 32) | lower_bits; - } - - *result = Decimal(result_array[1], result_array[0]); - return {}; -} - -/// \brief Do a division where the divisor fits into a single 32-bit integer. -static inline Status SingleDivide(const uint32_t* dividend, int64_t dividend_length, - uint32_t divisor, bool dividend_negative, - bool divisor_negative, Decimal* result, - Decimal* remainder) { - uint64_t r = 0; - constexpr int64_t kDecimalArrayLength = Decimal::kBitWidth / sizeof(uint32_t) + 1; - std::array quotient; - for (int64_t i = 0; i < dividend_length; ++i) { - r <<= 32; - r |= dividend[i]; - if (r < divisor) { - quotient[i] = 0; - } else { - quotient[i] = static_cast(r / divisor); - r -= static_cast(quotient[i]) * divisor; - } - } - - ICEBERG_RETURN_UNEXPECTED(BuildFromArray(result, quotient.data(), dividend_length)); - - *remainder = static_cast(r); - FixDivisionSigns(result, remainder, dividend_negative, divisor_negative); - return {}; -} - -/// \brief Divide two Decimal values and return the result and remainder. -static inline Status DecimalDivide(const Decimal& dividend, const Decimal& divisor, - Decimal* result, Decimal* remainder) { - constexpr int64_t kDecimalArrayLength = Decimal::kBitWidth / sizeof(uint32_t); - // Split the dividend and divisor into 32-bit integer pieces so that we can work on - // them. - std::array dividend_array; - std::array divisor_array; - bool dividend_negative = dividend.high() < 0; - bool divisor_negative = divisor.high() < 0; - - // leave an extra zero before the dividend - dividend_array[0] = 0; - int64_t dividend_length = FillInArray(dividend, dividend_array.data() + 1) + 1; - int64_t divisor_length = FillInArray(divisor, divisor_array.data()); - - if (dividend_length <= divisor_length) { - *remainder = dividend; - *result = 0; - return {}; - } - - if (divisor_length == 0) { - return Invalid("Cannot divide by zero in DecimalDivide"); - } - - if (divisor_length == 1) { - ICEBERG_RETURN_UNEXPECTED(SingleDivide(dividend_array.data(), dividend_length, - divisor_array[0], dividend_negative, - divisor_negative, result, remainder)); - return {}; - } - - int64_t result_length = dividend_length - divisor_length; - std::array result_array; - - assert(result_length < kDecimalArrayLength); - - // Normalize by shifting both by a multiple of 2 so that the digit guessing is easier. - // The requirement is that divisor_array[0] is greater than 2**31. - int64_t normalize_bits = std::countl_zero(divisor_array[0]); - ShiftArrayLeft(divisor_array.data(), divisor_length, normalize_bits); - ShiftArrayLeft(dividend_array.data(), dividend_length, normalize_bits); - - // compute each digit in the result - for (int64_t i = 0; i < result_length; ++i) { - // Guess the next digit. At worst it is two too large - uint32_t guess = std::numeric_limits::max(); - const auto high_dividend = - static_cast(dividend_array[i]) << 32 | dividend_array[i + 1]; - if (dividend_array[i] != divisor_array[0]) { - guess = static_cast(high_dividend / divisor_array[0]); - } - - // catch all the cases where the guess is two too large and most of the cases where it - // is one too large - auto rhat = static_cast(high_dividend - - guess * static_cast(divisor_array[0])); - while (static_cast(divisor_array[1]) * guess > - (static_cast(rhat) << 32) + dividend_array[i + 2]) { - guess--; - rhat += divisor_array[0]; - if (static_cast(rhat) < divisor_array[1]) { - break; - } - } - - // substract off the guess * divisor from the dividend - uint64_t mult = 0; - for (int64_t j = divisor_length - 1; j >= 0; --j) { - mult += static_cast(guess) * divisor_array[j]; - uint32_t prev = dividend_array[i + j + 1]; - dividend_array[i + j + 1] -= static_cast(mult); - mult >>= 32; - if (dividend_array[i + j + 1] > prev) { - ++mult; - } - } - uint32_t prev = dividend_array[i]; - dividend_array[i] -= static_cast(mult); - - // if guess was too big, we add back divisor - if (dividend_array[i] > prev) { - guess--; - uint32_t carry = 0; - for (int64_t j = divisor_length - 1; j >= 0; --j) { - const auto sum = - static_cast(divisor_array[j]) + dividend_array[i + j + 1] + carry; - dividend_array[i + j + 1] = static_cast(sum); - carry = static_cast(sum >> 32); - } - dividend_array[i] += carry; - } - - result_array[i] = guess; - } - - // denormalize the remainder - ShiftArrayRight(dividend_array.data(), dividend_length, normalize_bits); - - // return result and remainder - ICEBERG_RETURN_UNEXPECTED(BuildFromArray(result, result_array.data(), result_length)); - ICEBERG_RETURN_UNEXPECTED( - BuildFromArray(remainder, dividend_array.data(), dividend_length)); - - FixDivisionSigns(result, remainder, dividend_negative, divisor_negative); - return {}; -} - -} // namespace - -Decimal::Decimal(std::string_view str) { - auto result = Decimal::FromString(str); - if (!result) { - throw std::runtime_error( - std::format("Failed to parse Decimal from string: {}, error: {}", str, - result.error().message)); - } - *this = std::move(result.value()); -} - -Decimal& Decimal::Negate() { - uint64_t result_low = ~low() + 1; - int64_t result_high = ~high(); - if (result_low == 0) { - result_high = SafeSignedAdd(result_high, 1); - } - *this = Decimal(result_high, result_low); - return *this; -} - -Decimal& Decimal::Abs() { return *this < 0 ? Negate() : *this; } - -Decimal Decimal::Abs(const Decimal& value) { - Decimal result(value); - return result.Abs(); -} - -Decimal& Decimal::operator+=(const Decimal& other) { - int64_t result_high = SafeSignedAdd(high(), other.high()); - uint64_t result_low = low() + other.low(); - result_high = SafeSignedAdd(result_high, result_low < low()); - *this = Decimal(result_high, result_low); - return *this; -} - -Decimal& Decimal::operator-=(const Decimal& other) { - int64_t result_high = SafeSignedSubtract(high(), other.high()); - uint64_t result_low = low() - other.low(); - result_high = SafeSignedSubtract(result_high, result_low > low()); - *this = Decimal(result_high, result_low); - return *this; -} - -Decimal& Decimal::operator*=(const Decimal& other) { - // Since the max value of Decimal is supposed to be 1e38 - 1 and the min the - // negation taking the abosolute values here should aways be safe. - const bool negate = Sign() != other.Sign(); - Decimal x = Decimal::Abs(*this); - Decimal y = Decimal::Abs(other); - - Uint128 r(x); - r *= Uint128(y); - - *this = Decimal(static_cast(r.high()), r.low()); - if (negate) { - Negate(); - } - return *this; -} - -Result> Decimal::Divide(const Decimal& divisor) const { - std::pair result; - ICEBERG_RETURN_UNEXPECTED(DecimalDivide(*this, divisor, &result.first, &result.second)); - return result; -} - -Decimal& Decimal::operator/=(const Decimal& other) { - Decimal remainder; - auto s = DecimalDivide(*this, other, this, &remainder); - assert(s); - return *this; -} - -Decimal& Decimal::operator|=(const Decimal& other) { - data_ |= other.data_; - return *this; -} - -Decimal& Decimal::operator&=(const Decimal& other) { - data_ &= other.data_; - return *this; -} - -Decimal& Decimal::operator<<=(uint32_t shift) { - if (shift != 0) { - uint64_t low_; - int64_t high_; - if (shift < 64) { - high_ = SafeLeftShift(high(), shift); - high_ |= low() >> (64 - shift); - low_ = low() << shift; - } else if (shift < 128) { - high_ = static_cast(low() << (shift - 64)); - low_ = 0; - } else { - high_ = 0; - low_ = 0; - } - *this = Decimal(high_, low_); - } - - return *this; -} - -Decimal& Decimal::operator>>=(uint32_t shift) { - if (shift != 0) { - uint64_t low_; - int64_t high_; - if (shift < 64) { - low_ = low() >> shift; - low_ |= static_cast(high()) << (64 - shift); - high_ = high() >> shift; - } else if (shift < 128) { - low_ = static_cast(high() >> (shift - 64)); - high_ = high() >> 63; - } else { - high_ = high() >> 63; - low_ = static_cast(high_); - } - *this = Decimal(high_, low_); - } - - return *this; -} - -namespace { - struct DecimalComponents { std::string_view while_digits; std::string_view fractional_digits; @@ -616,7 +204,7 @@ constexpr std::array kDecimal128PowersOfTen = { Decimal(542101086242752217LL, 68739955140067328ULL), Decimal(5421010862427522170LL, 687399551400673280ULL)}; -static inline void ShiftAndAdd(std::string_view input, std::array& out) { +static inline void ShiftAndAdd(std::string_view input, uint128_t& out) { for (size_t pos = 0; pos < input.size();) { const size_t group_size = std::min(kInt64DecimalDigits, input.size() - pos); const uint64_t multiple = kUInt64PowersOfTen[group_size]; @@ -624,81 +212,11 @@ static inline void ShiftAndAdd(std::string_view input, std::array& std::from_chars(input.data() + pos, input.data() + pos + group_size, value); - for (size_t i = 0; i < 2; ++i) { - Uint128 tmp(out[i]); - tmp *= Uint128(multiple); - tmp += Uint128(value); - out[i] = tmp.low(); - value = tmp.high(); - } + out = out * multiple + value; pos += group_size; } } -// Returns a mask for the bit_index lower order bits. -// Only valid for bit_index in the range [0, 64). -constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) { - return (static_cast(1) << bit_index) - 1; -} - -static void AppendLittleEndianArrayToString(const std::array& array, - std::string* out) { - const auto most_significant_non_zero = std::ranges::find_if( - array.rbegin(), array.rend(), [](uint64_t v) { return v != 0; }); - if (most_significant_non_zero == array.rend()) { - out->push_back('0'); - return; - } - - size_t most_significant_elem_idx = &*most_significant_non_zero - array.data(); - std::array copy = array; - constexpr uint32_t k1e9 = 1000000000U; - constexpr size_t kNumBits = 128; - // Segments will contain the array split into groups that map to decimal digits, in - // little endian order. Each segment will hold at most 9 decimal digits. For example, if - // the input represents 9876543210123456789, then segments will be [123456789, - // 876543210, 9]. - // The max number of segments needed = ceil(kNumBits * log(2) / log(1e9)) - // = ceil(kNumBits / 29.897352854) <= ceil(kNumBits / 29). - std::array segments; - size_t num_segments = 0; - uint64_t* most_significant_elem = ©[most_significant_elem_idx]; - - do { - // Compute remainder = copy % 1e9 and copy = copy / 1e9. - uint32_t remainder = 0; - uint64_t* elem = most_significant_elem; - do { - // Compute dividend = (remainder << 32) | *elem (a virtual 96-bit integer); - // *elem = dividend / 1e9; - // remainder = dividend % 1e9. - auto hi = static_cast(*elem >> 32); - auto lo = static_cast(*elem & LeastSignificantBitMask(32)); - uint64_t dividend_hi = (static_cast(remainder) << 32) | hi; - uint64_t quotient_hi = dividend_hi / k1e9; - remainder = static_cast(dividend_hi % k1e9); - uint64_t dividend_lo = (static_cast(remainder) << 32) | lo; - uint64_t quotient_lo = dividend_lo / k1e9; - remainder = static_cast(dividend_lo % k1e9); - *elem = (quotient_hi << 32) | quotient_lo; - } while (elem-- != copy.data()); - - segments[num_segments++] = remainder; - } while (*most_significant_elem != 0 || most_significant_elem-- != copy.data()); - - const uint32_t* segment = &segments[num_segments - 1]; - std::stringstream oss; - // First segment is formatted as-is. - oss << *segment; - // Remaining segments are formatted with leading zeros to fill 9 digits. e.g. 123 is - // formatted as "000000123" - while (segment != segments.data()) { - --segment; - oss << std::setfill('0') << std::setw(9) << *segment; - } - out->append(oss.str()); -} - static void AdjustIntegerStringWithScale(std::string* str, int32_t scale) { if (scale == 0) { return; @@ -766,6 +284,91 @@ static void AdjustIntegerStringWithScale(std::string* str, int32_t scale) { } // namespace +Decimal::Decimal(std::string_view str) { + auto result = Decimal::FromString(str); + if (!result) { + throw std::runtime_error( + std::format("Failed to parse Decimal from string: {}, error: {}", str, + result.error().message)); + } + *this = std::move(result.value()); +} + +Decimal& Decimal::Negate() { + uint128_t u = static_cast(~data_) + 1; + data_ = static_cast(u); + return *this; +} + +Decimal& Decimal::Abs() { return *this < 0 ? Negate() : *this; } + +Decimal Decimal::Abs(const Decimal& value) { + Decimal result(value); + return result.Abs(); +} + +Decimal& Decimal::operator+=(const Decimal& other) { + data_ += other.data_; + return *this; +} + +Decimal& Decimal::operator-=(const Decimal& other) { + data_ -= other.data_; + return *this; +} + +Decimal& Decimal::operator*=(const Decimal& other) { + data_ *= other.data_; + return *this; +} + +Result> Decimal::Divide(const Decimal& divisor) const { + std::pair result; + if (divisor == 0) { + return Invalid("Cannot divide by zero in Decimal::Divide"); + } + return std::make_pair(*this / divisor, *this % divisor); +} + +Decimal& Decimal::operator/=(const Decimal& other) { + data_ /= other.data_; + return *this; +} + +Decimal& Decimal::operator|=(const Decimal& other) { + data_ |= other.data_; + return *this; +} + +Decimal& Decimal::operator&=(const Decimal& other) { + data_ &= other.data_; + return *this; +} + +Decimal& Decimal::operator<<=(uint32_t shift) { + if (shift != 0) { + if (shift < 128) { + data_ = static_cast(static_cast(data_) << shift); + } else { + data_ = 0; + } + } + + return *this; +} + +Decimal& Decimal::operator>>=(uint32_t shift) { + if (shift != 0) { + if (shift < 128) { + data_ >>= shift; + } else { + data_ = (data_ < 0) ? -1 : 0; + } + } + + return *this; +} + Result Decimal::ToString(int32_t scale) const { if (scale < -kMaxScale || scale > kMaxScale) { return InvalidArgument( @@ -778,17 +381,47 @@ Result Decimal::ToString(int32_t scale) const { } std::string Decimal::ToIntegerString() const { - std::string result; - if (high() < 0) { - result.push_back('-'); - Decimal abs = *this; - abs.Negate(); - AppendLittleEndianArrayToString({abs.low(), static_cast(abs.high())}, - &result); - } else { - AppendLittleEndianArrayToString({low(), static_cast(high())}, &result); + if (data_ == 0) { + return "0"; } - return result; + + bool negative = data_ < 0; + uint128_t uval = + negative ? -static_cast(data_) : static_cast(data_); + + constexpr uint32_t k1e9 = 1000000000U; + constexpr size_t kNumBits = 128; + // Segments will contain the array split into groups that map to decimal digits, in + // little endian order. Each segment will hold at most 9 decimal digits. For example, if + // the input represents 9876543210123456789, then segments will be [123456789, + // 876543210, 9]. + // The max number of segments needed = ceil(kNumBits * log(2) / log(1e9)) + // = ceil(kNumBits / 29.897352854) <= ceil(kNumBits / 29). + std::array segments; + size_t num_segments = 0; + + while (uval > 0) { + // Compute remainder = uval % 1e9 and uval = uval / 1e9. + auto remainder = static_cast(uval % k1e9); + uval /= k1e9; + segments[num_segments++] = remainder; + } + + std::ostringstream oss; + if (negative) { + oss << '-'; + } + + // First segment is formatted as-is. + oss << segments[num_segments - 1]; + + // Remaining segments are formatted with leading zeros to fill 9 digits. e.g. 123 is + // formatted as "000000123" + for (size_t i = num_segments - 1; i-- > 0;) { + oss << std::setw(9) << std::setfill('0') << segments[i]; + } + + return oss.str(); } Result Decimal::FromString(std::string_view str, int32_t* precision, @@ -820,9 +453,10 @@ Result Decimal::FromString(std::string_view str, int32_t* precision, // Index 1 is the high part, index 0 is the low part std::array result_array = {0, 0}; - ShiftAndAdd(dec.while_digits, result_array); - ShiftAndAdd(dec.fractional_digits, result_array); - Decimal result(static_cast(result_array[1]), result_array[0]); + uint128_t value = 0; + ShiftAndAdd(dec.while_digits, value); + ShiftAndAdd(dec.fractional_digits, value); + Decimal result(static_cast(value)); if (dec.sign == '-') { result.Negate(); @@ -1384,40 +1018,38 @@ std::ostream& operator<<(std::ostream& os, const Decimal& decimal) { } // Unary operators -ICEBERG_EXPORT Decimal operator-(const Decimal& operand) { - Decimal result(operand.high(), operand.low()); +Decimal operator-(const Decimal& operand) { + Decimal result(operand.data_); return result.Negate(); } -ICEBERG_EXPORT Decimal operator~(const Decimal& operand) { - return {~operand.high(), ~operand.low()}; -} +Decimal operator~(const Decimal& operand) { return {~operand.data_}; } // Binary operators -ICEBERG_EXPORT Decimal operator+(const Decimal& lhs, const Decimal& rhs) { +Decimal operator+(const Decimal& lhs, const Decimal& rhs) { Decimal result(lhs); result += rhs; return result; } -ICEBERG_EXPORT Decimal operator-(const Decimal& lhs, const Decimal& rhs) { +Decimal operator-(const Decimal& lhs, const Decimal& rhs) { Decimal result(lhs); result -= rhs; return result; } -ICEBERG_EXPORT Decimal operator*(const Decimal& lhs, const Decimal& rhs) { +Decimal operator*(const Decimal& lhs, const Decimal& rhs) { Decimal result(lhs); result *= rhs; return result; } -ICEBERG_EXPORT Decimal operator/(const Decimal& lhs, const Decimal& rhs) { - return lhs.Divide(rhs).value().first; +Decimal operator/(const Decimal& lhs, const Decimal& rhs) { + return lhs.data_ / rhs.data_; } -ICEBERG_EXPORT Decimal operator%(const Decimal& lhs, const Decimal& rhs) { - return lhs.Divide(rhs).value().second; +Decimal operator%(const Decimal& lhs, const Decimal& rhs) { + return lhs.data_ % rhs.data_; } } // namespace iceberg diff --git a/src/iceberg/util/decimal.h b/src/iceberg/util/decimal.h index 322f706ea..678074b91 100644 --- a/src/iceberg/util/decimal.h +++ b/src/iceberg/util/decimal.h @@ -54,6 +54,10 @@ class ICEBERG_EXPORT Decimal { /// \brief Default constructor initializes to zero. constexpr Decimal() noexcept = default; + /// \brief Create a Decimal from a 128-bit integer. + constexpr Decimal(int128_t value) noexcept // NOLINT + : data_(value) {} + /// \brief Create a Decimal from any integer not wider than 64 bits. template requires(std::is_integral_v && (sizeof(T) <= sizeof(uint64_t))) @@ -129,6 +133,9 @@ class ICEBERG_EXPORT Decimal { return result; } + /// \brief Get the underlying 128-bit integer representation of the number. + constexpr int128_t value() const { return data_; } + /// \brief Get the high bits of the two's complement representation of the number. constexpr int64_t high() const { return static_cast(data_ >> 64); } @@ -218,7 +225,7 @@ class ICEBERG_EXPORT Decimal { } const uint8_t* native_endian_bytes() const { - return reinterpret_cast(data_); + return reinterpret_cast(&data_); } /// \brief Returns the raw bytes of the value in native-endian byte order. @@ -240,6 +247,15 @@ class ICEBERG_EXPORT Decimal { return lhs.data_ != rhs.data_; } + friend Decimal operator-(const Decimal& operand); + friend Decimal operator~(const Decimal& operand); + + friend Decimal operator+(const Decimal& lhs, const Decimal& rhs); + friend Decimal operator-(const Decimal& lhs, const Decimal& rhs); + friend Decimal operator*(const Decimal& lhs, const Decimal& rhs); + friend Decimal operator/(const Decimal& lhs, const Decimal& rhs); + friend Decimal operator%(const Decimal& lhs, const Decimal& rhs); + private: static constexpr int32_t highIndex() { if constexpr (std::endian::native == std::endian::little) { @@ -263,13 +279,5 @@ class ICEBERG_EXPORT Decimal { ICEBERG_EXPORT std::ostream& operator<<(std::ostream& os, const Decimal& decimal); // Unary operators -ICEBERG_EXPORT Decimal operator-(const Decimal& operand); -ICEBERG_EXPORT Decimal operator~(const Decimal& operand); - -ICEBERG_EXPORT Decimal operator+(const Decimal& lhs, const Decimal& rhs); -ICEBERG_EXPORT Decimal operator-(const Decimal& lhs, const Decimal& rhs); -ICEBERG_EXPORT Decimal operator*(const Decimal& lhs, const Decimal& rhs); -ICEBERG_EXPORT Decimal operator/(const Decimal& lhs, const Decimal& rhs); -ICEBERG_EXPORT Decimal operator%(const Decimal& lhs, const Decimal& rhs); } // namespace iceberg From a46b2f49b4cee20caf1dcf24bf1693ac5be2b19e Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Fri, 29 Aug 2025 23:27:56 +0800 Subject: [PATCH 15/20] fix: remove unrelated codes --- src/iceberg/CMakeLists.txt | 1 - src/iceberg/util/decimal.cc | 25 +++++++---------------- src/iceberg/util/decimal.h | 18 ----------------- test/decimal_test.cc | 40 +++++++++++++------------------------ 4 files changed, 21 insertions(+), 63 deletions(-) diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 18ec6c60e..07070d5d5 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -19,7 +19,6 @@ set(ICEBERG_INCLUDES "$" "$") set(ICEBERG_SOURCES catalog/in_memory_catalog.cc - util/decimal.cc expression/expression.cc expression/literal.cc file_reader.cc diff --git a/src/iceberg/util/decimal.cc b/src/iceberg/util/decimal.cc index 80fad5cc5..faf312238 100644 --- a/src/iceberg/util/decimal.cc +++ b/src/iceberg/util/decimal.cc @@ -24,21 +24,17 @@ #include "iceberg/util/decimal.h" -#include #include #include #include #include #include #include -#include #include #include #include #include -#include #include -#include #include #include #include @@ -427,11 +423,11 @@ std::string Decimal::ToIntegerString() const { Result Decimal::FromString(std::string_view str, int32_t* precision, int32_t* scale) { if (str.empty()) { - return InvalidArgument("Decimal::FromString: empty string is not a valid Decimal"); + return InvalidArgument("Empty string is not a valid Decimal"); } DecimalComponents dec; if (!ParseDecimalComponents(str, &dec)) { - return InvalidArgument("Decimal::FromString: invalid decimal string '{}'", str); + return InvalidArgument("Invalid decimal string '{}'", str); } // Count number of significant digits (without leading zeros) @@ -451,8 +447,6 @@ Result Decimal::FromString(std::string_view str, int32_t* precision, parsed_scale = static_cast(dec.fractional_digits.size()); } - // Index 1 is the high part, index 0 is the low part - std::array result_array = {0, 0}; uint128_t value = 0; ShiftAndAdd(dec.while_digits, value); ShiftAndAdd(dec.fractional_digits, value); @@ -466,9 +460,8 @@ Result Decimal::FromString(std::string_view str, int32_t* precision, // For the scale to 0, to avoid negative scales (due to compatibility issues with // external systems such as databases) if (parsed_scale < -kMaxScale) { - return InvalidArgument( - "Decimal::FromString: scale must be in the range [-{}, {}], was {}", kMaxScale, - kMaxScale, parsed_scale); + return InvalidArgument("scale must be in the range [-{}, {}], was {}", kMaxScale, + kMaxScale, parsed_scale); } result *= kDecimal128PowersOfTen[-parsed_scale]; @@ -552,8 +545,6 @@ struct RealTraits { static constexpr const float* powers_of_ten() { return kFloatPowersOfTen.data(); } static constexpr float two_to_64(float x) { return x * 1.8446744e+19f; } - static constexpr float two_to_128(float x) { return x == 0 ? 0 : kFloatInf; } - static constexpr float two_to_192(float x) { return x == 0 ? 0 : kFloatInf; } static constexpr int32_t kMantissaBits = 24; // ceil(log10(2 ^ kMantissaBits)) @@ -567,8 +558,6 @@ struct RealTraits { static constexpr const double* powers_of_ten() { return kDoublePowersOfTen.data(); } static constexpr double two_to_64(double x) { return x * 1.8446744073709552e+19; } - static constexpr double two_to_128(double x) { return x * 3.402823669209385e+38; } - static constexpr double two_to_192(double x) { return x * 6.277101735386681e+57; } static constexpr int32_t kMantissaBits = 53; // ceil(log10(2 ^ kMantissaBits)) @@ -791,7 +780,7 @@ struct DecimalRealConversion { return x; } - /// An approximate conversion from Decimal128 to Real that guarantees: + /// An approximate conversion from Decimal to Real that guarantees: /// 1. If the decimal is an integer, the conversion is exact. /// 2. If the number of fractional digits is <= RealTraits::kMantissaDigits (e.g. /// 8 for float and 16 for double), the conversion is within 1 ULP of the exact @@ -864,7 +853,7 @@ static inline uint64_t UInt64FromBigEndian(const uint8_t* bytes, int32_t length) // Using memcpy instead of special casing for length // and doing the conversion in 16, 32 parts, which could // possibly create unaligned memory access on certain platforms - memcpy(reinterpret_cast(&result) + 8 - length, bytes, length); + std::memcpy(reinterpret_cast(&result) + 8 - length, bytes, length); if constexpr (std::endian::native == std::endian::little) { return std::byteswap(result); @@ -1008,7 +997,7 @@ double Decimal::ToDouble(int32_t scale) const { std::array Decimal::ToBytes() const { std::array out{{0}}; - memcpy(out.data(), &data_, kByteWidth); + std::memcpy(out.data(), &data_, kByteWidth); return out; } diff --git a/src/iceberg/util/decimal.h b/src/iceberg/util/decimal.h index 678074b91..f765fc468 100644 --- a/src/iceberg/util/decimal.h +++ b/src/iceberg/util/decimal.h @@ -257,27 +257,9 @@ class ICEBERG_EXPORT Decimal { friend Decimal operator%(const Decimal& lhs, const Decimal& rhs); private: - static constexpr int32_t highIndex() { - if constexpr (std::endian::native == std::endian::little) { - return 1; - } else { - return 0; - } - } - - static constexpr int32_t lowIndex() { - if constexpr (std::endian::native == std::endian::little) { - return 0; - } else { - return 1; - } - } - int128_t data_{0}; }; ICEBERG_EXPORT std::ostream& operator<<(std::ostream& os, const Decimal& decimal); -// Unary operators - } // namespace iceberg diff --git a/test/decimal_test.cc b/test/decimal_test.cc index 1ba633bf0..61c5cca74 100644 --- a/test/decimal_test.cc +++ b/test/decimal_test.cc @@ -23,11 +23,9 @@ #include #include #include -#include #include #include -#include #include "iceberg/util/int128.h" #include "matchers.h" @@ -48,11 +46,6 @@ void AssertDecimalFromString(const std::string& s, const Decimal& expected, EXPECT_EQ(expected_scale, scale); } -Decimal DecimalFromInt128(int128_t value) { - return {static_cast(value >> 64), - static_cast(value & 0xFFFFFFFFFFFFFFFFULL)}; -} - } // namespace TEST(DecimalTest, Basics) { @@ -279,23 +272,21 @@ TEST(DecimalTest, FromStringInvalid) { // Empty string auto result = Decimal::FromString(""); ASSERT_THAT(result, IsError(ErrorKind::kInvalidArgument)); - ASSERT_THAT(result, HasErrorMessage( - "Decimal::FromString: empty string is not a valid Decimal")); + ASSERT_THAT(result, HasErrorMessage("Empty string is not a valid Decimal")); for (const auto& invalid_string : std::vector{"-", "0.0.0", "0-13-32", "a", "-23092.235-", "-+23092.235", "+-23092.235", "00a", "1e1a", "0.00123D/3", "1.23eA8", "1.23E+3A", "-1.23E--5", "1.2345E+++07"}) { auto result = Decimal::FromString(invalid_string); ASSERT_THAT(result, IsError(ErrorKind::kInvalidArgument)); - ASSERT_THAT(result, HasErrorMessage("Decimal::FromString: invalid decimal string")); + ASSERT_THAT(result, HasErrorMessage("Invalid decimal string")); } for (const auto& invalid_string : std::vector{"1e39", "-1e39", "9e39", "-9e39", "9.9e40", "-9.9e40"}) { auto result = Decimal::FromString(invalid_string); ASSERT_THAT(result, IsError(ErrorKind::kInvalidArgument)); - ASSERT_THAT(result, - HasErrorMessage("Decimal::FromString: scale must be in the range")); + ASSERT_THAT(result, HasErrorMessage("scale must be in the range")); } } @@ -1094,11 +1085,10 @@ TEST(DecimalTestFunctionality, Multiply) { for (auto x : std::vector{-INT64_MAX, -INT32_MAX, 0, INT32_MAX, INT64_MAX}) { for (auto y : std::vector{-INT32_MAX, -32, -2, -1, 0, 1, 2, 32, INT32_MAX}) { - Decimal decimal_x = DecimalFromInt128(x); - Decimal decimal_y = DecimalFromInt128(y); + Decimal decimal_x(x); + Decimal decimal_y(y); Decimal result = decimal_x * decimal_y; - EXPECT_EQ(DecimalFromInt128(x * y), result) - << " x: " << decimal_x << " y: " << decimal_y; + EXPECT_EQ(Decimal(x * y), result) << " x: " << decimal_x << " y: " << decimal_y; } } } @@ -1111,11 +1101,10 @@ TEST(DecimalTestFunctionality, Divide) { for (auto x : std::vector{-INT64_MAX, -INT32_MAX, 0, INT32_MAX, INT64_MAX}) { for (auto y : std::vector{-INT32_MAX, -32, -2, -1, 1, 2, 32, INT32_MAX}) { - Decimal decimal_x = DecimalFromInt128(x); - Decimal decimal_y = DecimalFromInt128(y); + Decimal decimal_x(x); + Decimal decimal_y(y); Decimal result = decimal_x / decimal_y; - EXPECT_EQ(DecimalFromInt128(x / y), result) - << " x: " << decimal_x << " y: " << decimal_y; + EXPECT_EQ(Decimal(x / y), result) << " x: " << decimal_x << " y: " << decimal_y; } } } @@ -1129,11 +1118,10 @@ TEST(DecimalTestFunctionality, Modulo) { // Test some edge cases for (auto x : std::vector{-INT64_MAX, -INT32_MAX, 0, INT32_MAX, INT64_MAX}) { for (auto y : std::vector{-INT32_MAX, -32, -2, -1, 1, 2, 32, INT32_MAX}) { - Decimal decimal_x = DecimalFromInt128(x); - Decimal decimal_y = DecimalFromInt128(y); + Decimal decimal_x(x); + Decimal decimal_y(y); Decimal result = decimal_x % decimal_y; - EXPECT_EQ(DecimalFromInt128(x % y), result) - << " x: " << decimal_x << " y: " << decimal_y; + EXPECT_EQ(Decimal(x % y), result) << " x: " << decimal_x << " y: " << decimal_y; } } } @@ -1206,8 +1194,8 @@ TEST(DecimalTestFunctionality, FitsInPrecision) { TEST(DecimalTest, LeftShift) { auto check = [](int128_t x, uint32_t bits) { - auto expected = DecimalFromInt128(x << bits); - auto actual = DecimalFromInt128(x) << bits; + auto expected = Decimal(x << bits); + auto actual = Decimal(x) << bits; ASSERT_EQ(actual.low(), expected.low()); ASSERT_EQ(actual.high(), expected.high()); }; From 922d930f57d07c8fd000acb419205dd411e685e5 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 3 Sep 2025 23:05:38 +0800 Subject: [PATCH 16/20] fix: review comments --- LICENSE | 6 --- src/iceberg/util/decimal.cc | 97 +++++++++++++++---------------------- src/iceberg/util/decimal.h | 17 ++++--- src/iceberg/util/macros.h | 4 ++ test/CMakeLists.txt | 7 +-- 5 files changed, 55 insertions(+), 76 deletions(-) diff --git a/LICENSE b/LICENSE index b7f4c880b..d8ddd9575 100644 --- a/LICENSE +++ b/LICENSE @@ -250,12 +250,6 @@ The file src/iceberg/util/visit_type.h contains code adapted from https://github.com/apache/arrow/blob/main/cpp/src/arrow/visit_type_inline.h -Copyright: 2016-2025 The Apache Software Foundation. -Home page: https://arrow.apache.org/ -License: https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - The file src/iceberg/util/decimal.h contains code adapted from https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.h diff --git a/src/iceberg/util/decimal.cc b/src/iceberg/util/decimal.cc index faf312238..4e8e68928 100644 --- a/src/iceberg/util/decimal.cc +++ b/src/iceberg/util/decimal.cc @@ -51,15 +51,15 @@ namespace { // Signed left shift with well-defined behaviour on negative numbers or overflow template requires std::is_signed_v && std::is_integral_v -constexpr SignedInt SafeLeftShift(SignedInt u, Shift shift) { +constexpr SignedInt SafeLeftShift(SignedInt u, Shift bits) { using UnsignedInt = std::make_unsigned_t; - return static_cast(static_cast(u) << shift); + return static_cast(static_cast(u) << bits); } struct DecimalComponents { std::string_view while_digits; std::string_view fractional_digits; - int32_t exponent; + int32_t exponent{0}; char sign{0}; bool has_exponent{false}; }; @@ -100,9 +100,8 @@ bool ParseDecimalComponents(std::string_view str, DecimalComponents* out) { // Optional dot if (IsDot(str[pos])) { - ++pos; // Second run of digits after the dot - pos = ParseDigitsRun(str, pos, &out->fractional_digits); + pos = ParseDigitsRun(str, ++pos, &out->fractional_digits); } if (out->fractional_digits.empty() && out->while_digits.empty()) { // Need at least some digits (whole or fractional) @@ -200,7 +199,7 @@ constexpr std::array kDecimal128PowersOfTen = { Decimal(542101086242752217LL, 68739955140067328ULL), Decimal(5421010862427522170LL, 687399551400673280ULL)}; -static inline void ShiftAndAdd(std::string_view input, uint128_t& out) { +inline void ShiftAndAdd(std::string_view input, uint128_t& out) { for (size_t pos = 0; pos < input.size();) { const size_t group_size = std::min(kInt64DecimalDigits, input.size() - pos); const uint64_t multiple = kUInt64PowersOfTen[group_size]; @@ -213,12 +212,11 @@ static inline void ShiftAndAdd(std::string_view input, uint128_t& out) { } } -static void AdjustIntegerStringWithScale(std::string* str, int32_t scale) { +void AdjustIntegerStringWithScale(int32_t scale, std::string* str) { if (scale == 0) { return; } - assert(str != nullptr); - assert(!str->empty()); + ICEBERG_DCHECK(str != nullptr && !str->empty(), "str must not be null or empty"); const bool is_negative = str->front() == '-'; const auto is_negative_offset = static_cast(is_negative); const auto len = static_cast(str->size()); @@ -291,7 +289,7 @@ Decimal::Decimal(std::string_view str) { } Decimal& Decimal::Negate() { - uint128_t u = static_cast(~data_) + 1; + uint128_t u = ~static_cast(data_) + 1; data_ = static_cast(u); return *this; } @@ -341,25 +339,17 @@ Decimal& Decimal::operator&=(const Decimal& other) { return *this; } -Decimal& Decimal::operator<<=(uint32_t shift) { - if (shift != 0) { - if (shift < 128) { - data_ = static_cast(static_cast(data_) << shift); - } else { - data_ = 0; - } +Decimal& Decimal::operator<<=(uint32_t bits) { + if (bits != 0) { + data_ = static_cast(static_cast(data_) << bits); } return *this; } -Decimal& Decimal::operator>>=(uint32_t shift) { - if (shift != 0) { - if (shift < 128) { - data_ >>= shift; - } else { - data_ = (data_ < 0) ? -1 : 0; - } +Decimal& Decimal::operator>>=(uint32_t bits) { + if (bits != 0) { + data_ >>= bits; } return *this; @@ -372,7 +362,7 @@ Result Decimal::ToString(int32_t scale) const { kMaxScale, scale); } std::string str(ToIntegerString()); - AdjustIntegerStringWithScale(&str, scale); + AdjustIntegerStringWithScale(scale, &str); return str; } @@ -571,7 +561,7 @@ struct DecimalRealConversion { template static Real PowerOfTen(int32_t exp) { constexpr int32_t N = kPrecomputedPowersOfTen; - assert(exp >= -N && exp <= N); + ICEBERG_DCHECK(exp >= -N && exp <= N, ""); return RealTraits::powers_of_ten()[N + exp]; } @@ -590,7 +580,7 @@ struct DecimalRealConversion { static constexpr int32_t kMaxScale = Decimal::kMaxScale; static constexpr auto& DecimalPowerOfTen(int32_t exp) { - assert(exp >= 0 && exp <= kMaxPrecision); + ICEBERG_DCHECK(exp >= 0 && exp <= kMaxPrecision, ""); return kDecimal128PowersOfTen[exp]; } @@ -635,11 +625,12 @@ struct DecimalRealConversion { } template - static Result FromPositiveApprox(Real real, int32_t precision, int32_t scale) { + static Result FromPositiveRealApprox(Real real, int32_t precision, + int32_t scale) { // Approximate algorithm that operates in the FP domain (thus subject // to precision loss). - const auto x = std::nearbyint(real * PowerOfTen(scale)); - const auto max_abs = PowerOfTen(precision); + const auto x = std::nearbyint(real * PowerOfTen(scale)); + const auto max_abs = PowerOfTen(precision); if (x <= -max_abs || x >= max_abs) { return Invalid("Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", real, precision, scale); @@ -649,10 +640,10 @@ struct DecimalRealConversion { const auto high = std::floor(std::ldexp(x, -64)); const auto low = x - std::ldexp(high, 64); - assert(high >= 0); - assert(high < 9.223372036854776e+18); // 2**63 - assert(low >= 0); - assert(low < 1.8446744073709552e+19); // 2**64 + ICEBERG_DCHECK(high >= 0, ""); + ICEBERG_DCHECK(high < 9.223372036854776e+18, ""); // 2**63 + ICEBERG_DCHECK(low >= 0, ""); + ICEBERG_DCHECK(low < 1.8446744073709552e+19, ""); // 2**64 return Decimal(static_cast(high), static_cast(low)); } @@ -665,7 +656,7 @@ struct DecimalRealConversion { // closest to `real * 10^scale`. if (scale < 0) { // Negative scales are not handled below, fall back to approx algorithm - return FromPositiveApprox(real, precision, scale); + return FromPositiveRealApprox(real, precision, scale); } // 1. Check that `real` is within acceptable bounds. @@ -737,7 +728,8 @@ struct DecimalRealConversion { total_exp += exp; // The supplementary right shift required so that // `x * 10^total_exp / 2^total_shift` fits in the decimal. - assert(static_cast(total_exp) < sizeof(kCeilLog2PowersOfTen)); + ICEBERG_DCHECK(static_cast(total_exp) < sizeof(kCeilLog2PowersOfTen), + ""); const int32_t bits = std::min(right_shift_by, kCeilLog2PowersOfTen[total_exp] - total_shift); total_shift += bits; @@ -799,7 +791,7 @@ struct DecimalRealConversion { // Split decimal into whole and fractional parts to avoid precision loss auto s = decimal.GetWholeAndFraction(scale); - assert(s); + ICEBERG_DCHECK(s, "Decimal::GetWholeAndFraction failed"); Real whole = ToRealPositiveNoSplit(s->first, 0); Real fraction = ToRealPositiveNoSplit(s->second, scale); @@ -809,8 +801,8 @@ struct DecimalRealConversion { template static Result FromReal(Real value, int32_t precision, int32_t scale) { - assert(precision >= 1 && precision <= kMaxPrecision); - assert(scale >= -kMaxScale && scale <= kMaxScale); + ICEBERG_DCHECK(precision >= 1 && precision <= kMaxPrecision, ""); + ICEBERG_DCHECK(scale >= -kMaxScale && scale <= kMaxScale, ""); if (!std::isfinite(value)) { return InvalidArgument("Cannot convert {} to Decimal", value); @@ -830,7 +822,7 @@ struct DecimalRealConversion { template static Real ToReal(const Decimal& decimal, int32_t scale) { - assert(scale >= -kMaxScale && scale <= kMaxScale); + ICEBERG_DCHECK(scale >= -kMaxScale && scale <= kMaxScale, ""); if (decimal.IsNegative()) { // Convert the absolute value to avoid precision loss @@ -866,7 +858,7 @@ static bool RescaleWouldCauseDataLoss(const Decimal& value, int32_t delta_scale, const Decimal& multiplier, Decimal* result) { if (delta_scale < 0) { auto res = value.Divide(multiplier); - assert(res); + ICEBERG_DCHECK(res, "Decimal::Divide failed"); *result = res->first; return res->second != 0; } @@ -886,7 +878,7 @@ Result Decimal::FromReal(double x, int32_t precision, int32_t scale) { } Result> Decimal::GetWholeAndFraction(int32_t scale) const { - assert(scale >= 0 && scale <= kMaxScale); + ICEBERG_DCHECK(scale >= 0 && scale <= kMaxScale, ""); Decimal multiplier(kDecimal128PowersOfTen[scale]); return Divide(multiplier); @@ -946,19 +938,6 @@ Result Decimal::FromBigEndian(const uint8_t* bytes, int32_t length) { } Result Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const { - if (orig_scale < 0 || orig_scale > kMaxScale) { - return InvalidArgument( - "Decimal::Rescale: original scale must be in the range [0, {}], " - "was {}", - kMaxScale, orig_scale); - } - if (new_scale < 0 || new_scale > kMaxScale) { - return InvalidArgument( - "Decimal::Rescale: new scale must be in the range [0, {}], " - "was {}", - kMaxScale, new_scale); - } - if (orig_scale == new_scale) { return *this; } @@ -967,7 +946,7 @@ Result Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const { const int32_t abs_delta_scale = std::abs(delta_scale); Decimal out; - assert(abs_delta_scale <= kMaxScale); + ICEBERG_DCHECK(abs_delta_scale <= kMaxScale, ""); auto& multiplier = kDecimal128PowersOfTen[abs_delta_scale]; @@ -975,15 +954,15 @@ Result Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const { RescaleWouldCauseDataLoss(*this, delta_scale, multiplier, &out); if (rescale_would_cause_data_loss) { - return Invalid("Rescale would cause data loss: {} -> {}", ToIntegerString(), - out.ToIntegerString()); + return Invalid("Rescale {} from {} to {} would cause data loss", ToIntegerString(), + orig_scale, new_scale); } return out; } bool Decimal::FitsInPrecision(int32_t precision) const { - assert(precision >= 1 && precision <= kMaxPrecision); + ICEBERG_DCHECK(precision >= 1 && precision <= kMaxPrecision, ""); return Decimal::Abs(*this) < kDecimal128PowersOfTen[precision]; } diff --git a/src/iceberg/util/decimal.h b/src/iceberg/util/decimal.h index f765fc468..9da204088 100644 --- a/src/iceberg/util/decimal.h +++ b/src/iceberg/util/decimal.h @@ -25,17 +25,15 @@ /// https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.h #include -#include #include #include #include #include #include -#include - #include "iceberg/iceberg_export.h" #include "iceberg/result.h" +#include "iceberg/util/formattable.h" #include "iceberg/util/int128.h" #include "iceberg/util/macros.h" @@ -44,7 +42,7 @@ namespace iceberg { /// \brief Represents 128-bit fixed-point decimal numbers. /// The max decimal precision that can be safely represented is /// 38 significant digits. -class ICEBERG_EXPORT Decimal { +class ICEBERG_EXPORT Decimal : public util::Formattable { public: static constexpr int32_t kBitWidth = 128; static constexpr int32_t kByteWidth = kBitWidth / 8; @@ -55,13 +53,13 @@ class ICEBERG_EXPORT Decimal { constexpr Decimal() noexcept = default; /// \brief Create a Decimal from a 128-bit integer. - constexpr Decimal(int128_t value) noexcept // NOLINT + constexpr Decimal(int128_t value) noexcept // NOLINT(google-explicit-constructor) : data_(value) {} /// \brief Create a Decimal from any integer not wider than 64 bits. template requires(std::is_integral_v && (sizeof(T) <= sizeof(uint64_t))) - constexpr Decimal(T value) noexcept // NOLINT + constexpr Decimal(T value) noexcept // NOLINT(google-explicit-constructor) { data_ = static_cast(value); } @@ -150,8 +148,15 @@ class ICEBERG_EXPORT Decimal { /// \brief Convert the Decimal value to an integer string. std::string ToIntegerString() const; + /// \brief Returns an integer string representation of the decimal value. + std::string ToString() const override { return ToIntegerString(); } + /// \brief Convert the decimal string to a Decimal value, optionally including precision /// and scale if they are provided not null. + /// \param str The string representation of the Decimal value. + /// \param[out] precision Optional pointer to store the precision of the parsed value. + /// \param[out] scale Optional pointer to store the scale of the parsed value. + /// \return The Decimal value. static Result FromString(std::string_view str, int32_t* precision = nullptr, int32_t* scale = nullptr); diff --git a/src/iceberg/util/macros.h b/src/iceberg/util/macros.h index f11a680cc..278035d3f 100644 --- a/src/iceberg/util/macros.h +++ b/src/iceberg/util/macros.h @@ -19,6 +19,8 @@ #pragma once +#include + #define ICEBERG_RETURN_UNEXPECTED(result) \ if (auto&& result_name = result; !result_name) [[unlikely]] { \ return std::unexpected(result_name.error()); \ @@ -36,3 +38,5 @@ #define ICEBERG_ASSIGN_OR_RAISE(lhs, rexpr) \ ICEBERG_ASSIGN_OR_RAISE_IMPL(ICEBERG_ASSIGN_OR_RAISE_NAME(result_, __COUNTER__), lhs, \ rexpr) + +#define ICEBERG_DCHECK(expr, message) assert((expr) && (message)) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e0583468b..d32a2d8bb 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -75,11 +75,7 @@ add_iceberg_test(table_test table_test.cc schema_json_test.cc) -add_iceberg_test(expression_test - SOURCES - decimal_test.cc - expression_test.cc - literal_test.cc) +add_iceberg_test(expression_test SOURCES expression_test.cc literal_test.cc) add_iceberg_test(json_serde_test SOURCES @@ -91,6 +87,7 @@ add_iceberg_test(json_serde_test add_iceberg_test(util_test SOURCES config_test.cc + decimal_test.cc endian_test.cc formatter_test.cc string_util_test.cc From e293fc9f21f9ed4544cfbf05ae0dbcd0a4223f8b Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 3 Sep 2025 23:26:27 +0800 Subject: [PATCH 17/20] chore: remove FromReal/ToReal/ToInteger conversions --- src/iceberg/util/decimal.cc | 339 -------------------- src/iceberg/util/decimal.h | 49 --- test/decimal_test.cc | 621 ------------------------------------ 3 files changed, 1009 deletions(-) diff --git a/src/iceberg/util/decimal.cc b/src/iceberg/util/decimal.cc index 4e8e68928..cb10d3f37 100644 --- a/src/iceberg/util/decimal.cc +++ b/src/iceberg/util/decimal.cc @@ -519,322 +519,6 @@ constexpr std::array kDoublePowersOfTen 1e56, 1e57, 1e58, 1e59, 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76}; -// ceil(log2(10 ^ k)) for k in [0...76] -constexpr std::array kCeilLog2PowersOfTen = { - 0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40, 44, 47, 50, - 54, 57, 60, 64, 67, 70, 74, 77, 80, 84, 87, 90, 94, 97, 100, 103, - 107, 110, 113, 117, 120, 123, 127, 130, 133, 137, 140, 143, 147, 150, 153, 157, - 160, 163, 167, 170, 173, 177, 180, 183, 187, 190, 193, 196, 200, 203, 206, 210, - 213, 216, 220, 223, 226, 230, 233, 236, 240, 243, 246, 250, 253}; - -template -struct RealTraits {}; - -template <> -struct RealTraits { - static constexpr const float* powers_of_ten() { return kFloatPowersOfTen.data(); } - - static constexpr float two_to_64(float x) { return x * 1.8446744e+19f; } - - static constexpr int32_t kMantissaBits = 24; - // ceil(log10(2 ^ kMantissaBits)) - static constexpr int32_t kMantissaDigits = 8; - // Integers between zero and kMaxPreciseInteger can be precisely represented - static constexpr uint64_t kMaxPreciseInteger = (1ULL << kMantissaBits) - 1; -}; - -template <> -struct RealTraits { - static constexpr const double* powers_of_ten() { return kDoublePowersOfTen.data(); } - - static constexpr double two_to_64(double x) { return x * 1.8446744073709552e+19; } - - static constexpr int32_t kMantissaBits = 53; - // ceil(log10(2 ^ kMantissaBits)) - static constexpr int32_t kMantissaDigits = 16; - // Integers between zero and kMaxPreciseInteger can be precisely represented - static constexpr uint64_t kMaxPreciseInteger = (1ULL << kMantissaBits) - 1; -}; - -struct DecimalRealConversion { - // Return 10**exp, with a fast lookup, assuming `exp` is within bounds - template - static Real PowerOfTen(int32_t exp) { - constexpr int32_t N = kPrecomputedPowersOfTen; - ICEBERG_DCHECK(exp >= -N && exp <= N, ""); - return RealTraits::powers_of_ten()[N + exp]; - } - - // Return 10**exp, with a fast lookup if possible - template - static Real LargePowerOfTen(int32_t exp) { - constexpr int32_t N = kPrecomputedPowersOfTen; - if (exp >= -N && exp <= N) { - return RealTraits::powers_of_ten()[N + exp]; - } else { - return std::pow(static_cast(10.0), static_cast(exp)); - } - } - - static constexpr int32_t kMaxPrecision = Decimal::kMaxPrecision; - static constexpr int32_t kMaxScale = Decimal::kMaxScale; - - static constexpr auto& DecimalPowerOfTen(int32_t exp) { - ICEBERG_DCHECK(exp >= 0 && exp <= kMaxPrecision, ""); - return kDecimal128PowersOfTen[exp]; - } - - // Right shift positive `x` by positive `bits`, rounded half to even - static Decimal RoundedRightShift(const Decimal& x, int32_t bits) { - if (bits == 0) { - return x; - } - int64_t result_hi = x.high(); - uint64_t result_lo = x.low(); - uint64_t shifted = 0; - while (bits >= 64) { - // Retain the information that set bits were shifted right. - // This is important to detect an exact half. - shifted = result_lo | (shifted > 0); - result_lo = result_hi; - result_hi >>= 63; // for sign - bits -= 64; - } - if (bits > 0) { - shifted = (result_lo << (64 - bits)) | (shifted > 0); - result_lo >>= bits; - result_lo |= static_cast(result_hi) << (64 - bits); - result_hi >>= bits; - } - // We almost have our result, but now do the rounding. - constexpr uint64_t kHalf = 0x8000000000000000ULL; - if (shifted > kHalf) { - // Strictly more than half => round up - result_lo += 1; - result_hi += (result_lo == 0); - } else if (shifted == kHalf) { - // Exactly half => round to even - if ((result_lo & 1) != 0) { - result_lo += 1; - result_hi += (result_lo == 0); - } - } else { - // Strictly less than half => round down - } - return Decimal{result_hi, result_lo}; - } - - template - static Result FromPositiveRealApprox(Real real, int32_t precision, - int32_t scale) { - // Approximate algorithm that operates in the FP domain (thus subject - // to precision loss). - const auto x = std::nearbyint(real * PowerOfTen(scale)); - const auto max_abs = PowerOfTen(precision); - if (x <= -max_abs || x >= max_abs) { - return Invalid("Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", - real, precision, scale); - } - - // Extract high and low bits - const auto high = std::floor(std::ldexp(x, -64)); - const auto low = x - std::ldexp(high, 64); - - ICEBERG_DCHECK(high >= 0, ""); - ICEBERG_DCHECK(high < 9.223372036854776e+18, ""); // 2**63 - ICEBERG_DCHECK(low >= 0, ""); - ICEBERG_DCHECK(low < 1.8446744073709552e+19, ""); // 2**64 - return Decimal(static_cast(high), static_cast(low)); - } - - template - static Result FromPositiveReal(Real real, int32_t precision, int32_t scale) { - constexpr int32_t kMantissaBits = RealTraits::kMantissaBits; - constexpr int32_t kMantissaDigits = RealTraits::kMantissaDigits; - - // Problem statement: construct the Decimal with the value - // closest to `real * 10^scale`. - if (scale < 0) { - // Negative scales are not handled below, fall back to approx algorithm - return FromPositiveRealApprox(real, precision, scale); - } - - // 1. Check that `real` is within acceptable bounds. - const Real limit = PowerOfTen(precision - scale); - if (real > limit) { - // Checking the limit early helps ensure the computations below do not - // overflow. - // NOTE: `limit` is allowed here as rounding can make it smaller than - // the theoretical limit (for example, 1.0e23 < 10^23). - return Invalid("Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", - real, precision, scale); - } - - // 2. Losslessly convert `real` to `mant * 2**k` - int32_t binary_exp = 0; - const Real real_mant = std::frexp(real, &binary_exp); - // `real_mant` is within 0.5 and 1 and has M bits of precision. - // Multiply it by 2^M to get an exact integer. - const auto mant = static_cast(std::ldexp(real_mant, kMantissaBits)); - const int32_t k = binary_exp - kMantissaBits; - // (note that `real = mant * 2^k`) - - // 3. Start with `mant`. - // We want to end up with `real * 10^scale` i.e. `mant * 2^k * 10^scale`. - Decimal x(mant); - - if (k < 0) { - // k < 0 (i.e. binary_exp < kMantissaBits), is probably the common case - // when converting to decimal. It implies right-shifting by -k bits, - // while multiplying by 10^scale. We also must avoid overflow (losing - // bits on the left) and precision loss (losing bits on the right). - int32_t right_shift_by = -k; - int32_t mul_by_ten_to = scale; - - // At this point, `x` has kMantissaDigits significant digits but it can - // fit kMaxPrecision (excluding sign). We can therefore multiply by up - // to 10^(kMaxPrecision - kMantissaDigits). - constexpr int32_t kSafeMulByTenTo = kMaxPrecision - kMantissaDigits; - - if (mul_by_ten_to <= kSafeMulByTenTo) { - // Scale is small enough, so we can do it all at once. - x *= DecimalPowerOfTen(mul_by_ten_to); - x = RoundedRightShift(x, right_shift_by); - } else { - // Scale is too large, we cannot multiply at once without overflow. - // We use an iterative algorithm which alternately shifts left by - // multiplying by a power of ten, and shifts right by a number of bits. - - // First multiply `x` by as large a power of ten as possible - // without overflowing. - x *= DecimalPowerOfTen(kSafeMulByTenTo); - mul_by_ten_to -= kSafeMulByTenTo; - - // `x` now has full precision. However, we know we'll only - // keep `precision` digits at the end. Extraneous bits/digits - // on the right can be safely shifted away, before multiplying - // again. - // NOTE: if `precision` is the full precision then the algorithm will - // lose the last digit. If `precision` is almost the full precision, - // there can be an off-by-one error due to rounding. - const int32_t mul_step = std::max(1, kMaxPrecision - precision); - - // The running exponent, useful to compute by how much we must - // shift right to make place on the left before the next multiply. - int32_t total_exp = 0; - int32_t total_shift = 0; - while (mul_by_ten_to > 0 && right_shift_by > 0) { - const int32_t exp = std::min(mul_by_ten_to, mul_step); - total_exp += exp; - // The supplementary right shift required so that - // `x * 10^total_exp / 2^total_shift` fits in the decimal. - ICEBERG_DCHECK(static_cast(total_exp) < sizeof(kCeilLog2PowersOfTen), - ""); - const int32_t bits = - std::min(right_shift_by, kCeilLog2PowersOfTen[total_exp] - total_shift); - total_shift += bits; - // Right shift to make place on the left, then multiply - x = RoundedRightShift(x, bits); - right_shift_by -= bits; - // Should not overflow thanks to the precautions taken - x *= DecimalPowerOfTen(exp); - mul_by_ten_to -= exp; - } - if (mul_by_ten_to > 0) { - x *= DecimalPowerOfTen(mul_by_ten_to); - } - if (right_shift_by > 0) { - x = RoundedRightShift(x, right_shift_by); - } - } - } else { - // k >= 0 implies left-shifting by k bits and multiplying by 10^scale. - // The order of these operations therefore doesn't matter. We know - // we won't overflow because of the limit check above, and we also - // won't lose any significant bits on the right. - x *= DecimalPowerOfTen(scale); - x <<= k; - } - - // Rounding might have pushed `x` just above the max precision, check again - if (!x.FitsInPrecision(precision)) { - return Invalid("Cannot convert {} to Decimal(precision = {}, scale = {}): overflow", - real, precision, scale); - } - return x; - } - - template - static Real ToRealPositiveNoSplit(const Decimal& decimal, int32_t scale) { - Real x = RealTraits::two_to_64(static_cast(decimal.high())); - x += static_cast(decimal.low()); - x *= LargePowerOfTen(-scale); - return x; - } - - /// An approximate conversion from Decimal to Real that guarantees: - /// 1. If the decimal is an integer, the conversion is exact. - /// 2. If the number of fractional digits is <= RealTraits::kMantissaDigits (e.g. - /// 8 for float and 16 for double), the conversion is within 1 ULP of the exact - /// value. - /// 3. Otherwise, the conversion is within 2^(-RealTraits::kMantissaDigits+1) - /// (e.g. 2^-23 for float and 2^-52 for double) of the exact value. - /// Here "exact value" means the closest representable value by Real. - template - static Real ToRealPositive(const Decimal& decimal, int32_t scale) { - if (scale <= 0 || - (decimal.high() == 0 && decimal.low() <= RealTraits::kMaxPreciseInteger)) { - // No need to split the decimal if it is already an integer (scale <= 0) or if it - // can be precisely represented by Real - return ToRealPositiveNoSplit(decimal, scale); - } - - // Split decimal into whole and fractional parts to avoid precision loss - auto s = decimal.GetWholeAndFraction(scale); - ICEBERG_DCHECK(s, "Decimal::GetWholeAndFraction failed"); - - Real whole = ToRealPositiveNoSplit(s->first, 0); - Real fraction = ToRealPositiveNoSplit(s->second, scale); - - return whole + fraction; - } - - template - static Result FromReal(Real value, int32_t precision, int32_t scale) { - ICEBERG_DCHECK(precision >= 1 && precision <= kMaxPrecision, ""); - ICEBERG_DCHECK(scale >= -kMaxScale && scale <= kMaxScale, ""); - - if (!std::isfinite(value)) { - return InvalidArgument("Cannot convert {} to Decimal", value); - } - - if (value == 0) { - return Decimal{}; - } - - if (value < 0) { - ICEBERG_ASSIGN_OR_RAISE(auto decimal, FromPositiveReal(-value, precision, scale)); - return decimal.Negate(); - } else { - return FromPositiveReal(value, precision, scale); - } - } - - template - static Real ToReal(const Decimal& decimal, int32_t scale) { - ICEBERG_DCHECK(scale >= -kMaxScale && scale <= kMaxScale, ""); - - if (decimal.IsNegative()) { - // Convert the absolute value to avoid precision loss - auto abs = decimal; - abs.Negate(); - return -ToRealPositive(abs, scale); - } else { - return ToRealPositive(decimal, scale); - } - } -}; - // Helper function used by Decimal::FromBigEndian static inline uint64_t UInt64FromBigEndian(const uint8_t* bytes, int32_t length) { // We don't bounds check the length here because this is called by @@ -869,21 +553,6 @@ static bool RescaleWouldCauseDataLoss(const Decimal& value, int32_t delta_scale, } // namespace -Result Decimal::FromReal(float x, int32_t precision, int32_t scale) { - return DecimalRealConversion::FromReal(x, precision, scale); -} - -Result Decimal::FromReal(double x, int32_t precision, int32_t scale) { - return DecimalRealConversion::FromReal(x, precision, scale); -} - -Result> Decimal::GetWholeAndFraction(int32_t scale) const { - ICEBERG_DCHECK(scale >= 0 && scale <= kMaxScale, ""); - - Decimal multiplier(kDecimal128PowersOfTen[scale]); - return Divide(multiplier); -} - Result Decimal::FromBigEndian(const uint8_t* bytes, int32_t length) { static constexpr int32_t kMinDecimalBytes = 1; static constexpr int32_t kMaxDecimalBytes = 16; @@ -966,14 +635,6 @@ bool Decimal::FitsInPrecision(int32_t precision) const { return Decimal::Abs(*this) < kDecimal128PowersOfTen[precision]; } -float Decimal::ToFloat(int32_t scale) const { - return DecimalRealConversion::ToReal(*this, scale); -} - -double Decimal::ToDouble(int32_t scale) const { - return DecimalRealConversion::ToReal(*this, scale); -} - std::array Decimal::ToBytes() const { std::array out{{0}}; std::memcpy(out.data(), &data_, kByteWidth); diff --git a/src/iceberg/util/decimal.h b/src/iceberg/util/decimal.h index 9da204088..695bbbbf1 100644 --- a/src/iceberg/util/decimal.h +++ b/src/iceberg/util/decimal.h @@ -35,7 +35,6 @@ #include "iceberg/result.h" #include "iceberg/util/formattable.h" #include "iceberg/util/int128.h" -#include "iceberg/util/macros.h" namespace iceberg { @@ -160,19 +159,11 @@ class ICEBERG_EXPORT Decimal : public util::Formattable { static Result FromString(std::string_view str, int32_t* precision = nullptr, int32_t* scale = nullptr); - /// \brief Convert the floating-point value to a Decimal value with the given - /// precision and scale. - static Result FromReal(double real, int32_t precision, int32_t scale); - static Result FromReal(float real, int32_t precision, int32_t scale); - /// \brief Convert from a big-endian byte representation. The length must be /// between 1 and 16. /// \return error status if the length is an invalid value static Result FromBigEndian(const uint8_t* data, int32_t length); - /// \brief separate the integer and fractional parts for the given scale. - Result> GetWholeAndFraction(int32_t scale) const; - /// \brief Convert Decimal from one scale to another. Result Rescale(int32_t orig_scale, int32_t new_scale) const; @@ -189,46 +180,6 @@ class ICEBERG_EXPORT Decimal : public util::Formattable { return low() <=> other.low(); } - /// \brief Convert to a signed integer - template - requires std::is_same_v || std::is_same_v - Result ToInteger() const { - constexpr auto min_value = std::numeric_limits::min(); - constexpr auto max_value = std::numeric_limits::max(); - const auto& self = *this; - if (self < min_value || self > max_value) { - return Invalid("Invalid cast from Decimal to {} byte integer", sizeof(T)); - } - return static_cast(low()); - } - - /// \brief Convert to a signed integer - template - requires std::is_same_v || std::is_same_v - Status ToInteger(T* out) const { - ICEBERG_ASSIGN_OR_RAISE(auto result, ToInteger()); - *out = result; - return {}; - } - - /// \brief Convert to a floating-point number (scaled) - float ToFloat(int32_t scale) const; - /// \brief Convert to a floating-point number (scaled) - double ToDouble(int32_t scale) const; - - /// \brief Convert the Decimal value to a floating-point value with the given scale. - /// \param scale The scale to use for the conversion. - /// \return The floating-point value. - template - requires std::is_floating_point_v - T ToReal(int32_t scale) const { - if constexpr (std::is_same_v) { - return ToFloat(scale); - } else { - return ToDouble(scale); - } - } - const uint8_t* native_endian_bytes() const { return reinterpret_cast(&data_); } diff --git a/test/decimal_test.cc b/test/decimal_test.cc index 61c5cca74..6850d7aad 100644 --- a/test/decimal_test.cc +++ b/test/decimal_test.cc @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -416,590 +415,6 @@ TEST(DecimalTest, ToString) { } } -template -struct FromRealTestParam { - Real real; - int32_t precision; - int32_t scale; - const char* expected_string; -}; - -template -void CheckDecimalFromReal(Real real, int32_t precision, int32_t scale, - const char* expected_string) { - auto result = Decimal::FromReal(real, precision, scale); - ASSERT_THAT(result, IsOk()) << "Failed to convert real to Decimal: " << real - << ", precision: " << precision << ", scale: " << scale; - const Decimal& decimal = result.value(); - EXPECT_EQ(decimal.ToString(scale).value(), expected_string); - const std::string expected_neg = - (decimal) ? "-" + std::string(expected_string) : expected_string; - auto neg_result = Decimal::FromReal(-real, precision, scale); - ASSERT_THAT(neg_result, IsOk()) - << "Failed to convert negative real to Decimal: " << -real - << ", precision: " << precision << ", scale: " << scale; - const Decimal& neg_decimal = neg_result.value(); - EXPECT_EQ(neg_decimal.ToString(scale).value(), expected_neg); -} - -template -void CheckDecimalFromRealIntegerString(Real real, int32_t precision, int32_t scale, - const char* expected_string) { - auto result = Decimal::FromReal(real, precision, scale); - ASSERT_THAT(result, IsOk()) << "Failed to convert real to Decimal: " << real - << ", precision: " << precision << ", scale: " << scale; - const Decimal& decimal = result.value(); - EXPECT_EQ(decimal.ToIntegerString(), expected_string); - const std::string expected_neg = - (decimal) ? "-" + std::string(expected_string) : expected_string; - auto neg_result = Decimal::FromReal(-real, precision, scale); - ASSERT_THAT(neg_result, IsOk()) - << "Failed to convert negative real to Decimal: " << -real - << ", precision: " << precision << ", scale: " << scale; - const Decimal& neg_decimal = neg_result.value(); - EXPECT_EQ(neg_decimal.ToIntegerString(), expected_neg); -} - -using FromFloatTestParam = FromRealTestParam; -using FromDoubleTestParam = FromRealTestParam; - -template -class TestDecimalFromReal : public ::testing::Test { - public: - using ParamType = FromRealTestParam; - - void TestSuccess() { - const std::vector params{ - // clang-format off - {0.0f, 1, 0, "0"}, - {0.0f, 19, 4, "0.0000"}, - {123.0f, 7, 4, "123.0000"}, - {456.78f, 7, 4, "456.7800"}, - {456.784f, 5, 2, "456.78"}, - {456.786f, 5, 2, "456.79"}, - {999.99f, 5, 2, "999.99"}, - {123.0f, 19, 0, "123"}, - {123.4f, 19, 0, "123"}, - {123.6f, 19, 0, "124"}, - // 2**62 - {4.6116860184273879e+18, 19, 0, "4611686018427387904"}, - // 2**63 - {9.2233720368547758e+18, 19, 0, "9223372036854775808"}, - // 2**64 - {1.8446744073709552e+19, 20, 0, "18446744073709551616"}, - // clang-format on - }; - - for (const ParamType& param : params) { - CheckDecimalFromReal(param.real, param.precision, param.scale, - param.expected_string); - } - } - - void TestErrors() { - const std::vector params{ - {std::numeric_limits::infinity(), Decimal::kMaxPrecision / 2, 4, ""}, - {-std::numeric_limits::infinity(), Decimal::kMaxPrecision / 2, 4, ""}, - {std::numeric_limits::quiet_NaN(), Decimal::kMaxPrecision / 2, 4, ""}, - {-std::numeric_limits::quiet_NaN(), Decimal::kMaxPrecision / 2, 4, ""}, - }; - - for (const ParamType& param : params) { - auto result = Decimal::FromReal(param.real, param.precision, param.scale); - ASSERT_THAT(result, IsError(ErrorKind::kInvalidArgument)); - } - - const std::vector overflow_params{ - // Overflow errors - {1000.0, 3, 0, ""}, {-1000.0, 3, 0, ""}, {1000.0, 5, 2, ""}, - {-1000.0, 5, 2, ""}, {999.996, 5, 2, ""}, {-999.996, 5, 2, ""}, - }; - for (const ParamType& param : overflow_params) { - auto result = Decimal::FromReal(param.real, param.precision, param.scale); - ASSERT_THAT(result, IsError(ErrorKind::kInvalid)); - } - } -}; - -using RealTypes = ::testing::Types; -TYPED_TEST_SUITE(TestDecimalFromReal, RealTypes); - -TYPED_TEST(TestDecimalFromReal, TestSuccess) { this->TestSuccess(); } - -TYPED_TEST(TestDecimalFromReal, TestErrors) { this->TestErrors(); } - -TEST(TestDecimalFromReal, FromFloat) { - const std::vector params{ - // -- Stress the 24 bits of precision of a float - // 2**63 + 2**40 - FromFloatTestParam{.real = 9.223373e+18f, - .precision = 19, - .scale = 0, - .expected_string = "9223373136366403584"}, - // 2**64 - 2**40 - FromFloatTestParam{.real = 1.8446743e+19f, - .precision = 20, - .scale = 0, - .expected_string = "18446742974197923840"}, - // 2**64 + 2**41 - FromFloatTestParam{.real = 1.8446746e+19f, - .precision = 20, - .scale = 0, - .expected_string = "18446746272732807168"}, - // 2**14 - 2**-10 - FromFloatTestParam{ - .real = 16383.999f, .precision = 8, .scale = 3, .expected_string = "16383.999"}, - FromFloatTestParam{.real = 16383.999f, - .precision = 19, - .scale = 3, - .expected_string = "16383.999"}, - // 1 - 2**-24 - FromFloatTestParam{.real = 0.99999994f, - .precision = 10, - .scale = 10, - .expected_string = "0.9999999404"}, - FromFloatTestParam{.real = 0.99999994f, - .precision = 15, - .scale = 15, - .expected_string = "0.999999940395355"}, - FromFloatTestParam{.real = 0.99999994f, - .precision = 20, - .scale = 20, - .expected_string = "0.99999994039535522461"}, - FromFloatTestParam{.real = 0.99999994f, - .precision = 21, - .scale = 21, - .expected_string = "0.999999940395355224609"}, - FromFloatTestParam{.real = 0.99999994f, - .precision = 38, - .scale = 38, - .expected_string = "0.99999994039535522460937500000000000000"}, - // -- Other cases - // 10**38 - 2**103 - FromFloatTestParam{.real = 9.999999e+37f, - .precision = 38, - .scale = 0, - .expected_string = "99999986661652122824821048795547566080"}, - }; - - for (const auto& param : params) { - CheckDecimalFromReal(param.real, param.precision, param.scale, param.expected_string); - } -} - -TEST(TestDecimalFromReal, FromFloatLargeValues) { - for (int32_t scale = -38; scale <= 38; ++scale) { - float real = std::pow(10.0f, static_cast(scale)); - CheckDecimalFromRealIntegerString(real, 1, -scale, "1"); - } - - for (int32_t scale = -37; scale <= 36; ++scale) { - float real = 123.f * std::pow(10.f, static_cast(scale)); - CheckDecimalFromRealIntegerString(real, 2, -scale - 1, "12"); - CheckDecimalFromRealIntegerString(real, 3, -scale, "123"); - CheckDecimalFromRealIntegerString(real, 4, -scale + 1, "1230"); - } -} - -TEST(TestDecimalFromReal, FromDouble) { - const std::vector params{ - // -- Stress the 53 bits of precision of a double - // 2**63 + 2**11 - FromDoubleTestParam{.real = 9.223372036854778e+18, - .precision = 19, - .scale = 0, - .expected_string = "9223372036854777856"}, - // 2**64 - 2**11 - FromDoubleTestParam{.real = 1.844674407370955e+19, - .precision = 20, - .scale = 0, - .expected_string = "18446744073709549568"}, - // 2**64 + 2**11 - FromDoubleTestParam{.real = 1.8446744073709556e+19, - .precision = 20, - .scale = 0, - .expected_string = "18446744073709555712"}, - // 2**126 - FromDoubleTestParam{.real = 8.507059173023462e+37, - .precision = 38, - .scale = 0, - .expected_string = "85070591730234615865843651857942052864"}, - // 2**126 - 2**74 - FromDoubleTestParam{.real = 8.50705917302346e+37, - .precision = 38, - .scale = 0, - .expected_string = "85070591730234596976377720379361198080"}, - // 2**36 - 2**-16 - FromDoubleTestParam{.real = 68719476735.999985, - .precision = 11, - .scale = 0, - .expected_string = "68719476736"}, - FromDoubleTestParam{.real = 68719476735.999985, - .precision = 38, - .scale = 27, - .expected_string = "68719476735.999984741210937500000000000"}, - // -- Other cases - // Almost 10**38 (minus 2**73) - FromDoubleTestParam{.real = 9.999999999999998e+37, - .precision = 38, - .scale = 0, - .expected_string = "99999999999999978859343891977453174784"}, - FromDoubleTestParam{.real = 9.999999999999998e+27, - .precision = 38, - .scale = 10, - .expected_string = "9999999999999997384096481280.0000000000"}, - // 10**N (sometimes fits in N digits) - FromDoubleTestParam{.real = 1e23, - .precision = 23, - .scale = 0, - .expected_string = "99999999999999991611392"}, - FromDoubleTestParam{.real = 1e23, - .precision = 24, - .scale = 1, - .expected_string = "99999999999999991611392.0"}, - FromDoubleTestParam{.real = 1e36, - .precision = 37, - .scale = 0, - .expected_string = "1000000000000000042420637374017961984"}, - FromDoubleTestParam{.real = 1e36, - .precision = 38, - .scale = 1, - .expected_string = "1000000000000000042420637374017961984.0"}, - FromDoubleTestParam{.real = 1e37, - .precision = 37, - .scale = 0, - .expected_string = "9999999999999999538762658202121142272"}, - FromDoubleTestParam{.real = 1e37, - .precision = 38, - .scale = 1, - .expected_string = "9999999999999999538762658202121142272.0"}, - FromDoubleTestParam{.real = 1e38, - .precision = 38, - .scale = 0, - .expected_string = "99999999999999997748809823456034029568"}, - // Hand-picked test cases that can involve precision issues. - // More comprehensive testing is done in the PyArrow test suite. - FromDoubleTestParam{.real = 9.223372036854778e+10, - .precision = 19, - .scale = 8, - .expected_string = "92233720368.54777527"}, - FromDoubleTestParam{.real = 1.8446744073709556e+15, - .precision = 20, - .scale = 4, - .expected_string = "1844674407370955.5000"}, - FromDoubleTestParam{.real = 999999999999999.0, - .precision = 16, - .scale = 1, - .expected_string = "999999999999999.0"}, - FromDoubleTestParam{.real = 9999999999999998.0, - .precision = 17, - .scale = 1, - .expected_string = "9999999999999998.0"}, - FromDoubleTestParam{.real = 999999999999999.9, - .precision = 16, - .scale = 1, - .expected_string = "999999999999999.9"}, - FromDoubleTestParam{.real = 9999999987., - .precision = 38, - .scale = 22, - .expected_string = "9999999987.0000000000000000000000"}, - FromDoubleTestParam{.real = 9999999987., - .precision = 38, - .scale = 28, - .expected_string = "9999999987.0000000000000000000000000000"}, - // 1 - 2**-52 - // XXX the result should be 0.99999999999999977795539507496869191527 - // but our algorithm loses the right digit. - FromDoubleTestParam{.real = 0.9999999999999998, - .precision = 38, - .scale = 38, - .expected_string = "0.99999999999999977795539507496869191520"}, - FromDoubleTestParam{.real = 0.9999999999999998, - .precision = 20, - .scale = 20, - .expected_string = "0.99999999999999977796"}, - FromDoubleTestParam{.real = 0.9999999999999998, - .precision = 16, - .scale = 16, - .expected_string = "0.9999999999999998"}, - }; - - for (const auto& param : params) { - CheckDecimalFromReal(param.real, param.precision, param.scale, param.expected_string); - } -} - -TEST(TestDecimalFromReal, FromDoubleLargeValues) { - constexpr auto kMaxScale = Decimal::kMaxScale; - for (int32_t scale = -kMaxScale; scale <= kMaxScale; ++scale) { - if (std::abs(1 - scale) < kMaxScale) { - double real = std::pow(10.0, static_cast(scale)); - CheckDecimalFromRealIntegerString(real, 1, -scale, "1"); - } - } - - for (int32_t scale = -kMaxScale + 1; scale <= kMaxScale - 1; ++scale) { - if (std::abs(4 - scale) < kMaxScale) { - double real = 123. * std::pow(10.0, static_cast(scale)); - CheckDecimalFromRealIntegerString(real, 2, -scale - 1, "12"); - CheckDecimalFromRealIntegerString(real, 3, -scale, "123"); - CheckDecimalFromRealIntegerString(real, 4, -scale + 1, "1230"); - } - } -} - -template -struct ToRealTestParam { - std::string decimal_value; - int32_t scale; - Real expected; -}; - -using ToFloatTestParam = ToRealTestParam; -using ToDoubleTestParam = ToRealTestParam; - -template -void CheckDecimalToReal(const std::string& decimal_value, int32_t scale, Real expected) { - auto result = Decimal::FromString(decimal_value); - ASSERT_THAT(result, IsOk()) << "Failed to convert string to Decimal: " << decimal_value; - - const Decimal& decimal = result.value(); - auto real_result = decimal.ToReal(scale); - - EXPECT_EQ(real_result, expected) - << "Expected: " << expected << ", but got: " << real_result; -} - -template -void CheckDecimalToRealWithinOneULP(const std::string& decimal_value, int32_t scale, - Real expected) { - Decimal dec(decimal_value); - Real actual = dec.template ToReal(scale); - ASSERT_TRUE(actual == expected || actual == std::nextafter(expected, expected + 1) || - actual == std::nextafter(expected, expected - 1)) - << "Decimal value: " << decimal_value << ", scale: " << scale - << ", expected: " << expected << ", actual: " << actual; -} - -template -void CheckDecimalToRealWithinEpsilon(const std::string& decimal_value, int32_t scale, - Real epsilon, Real expected) { - Decimal dec(decimal_value); - Real actual = dec.template ToReal(scale); - ASSERT_LE(std::abs(actual - expected), epsilon) - << "Decimal value: " << decimal_value << ", scale: " << scale - << ", expected: " << expected << ", actual: " << actual; -} - -void CheckDecimalToRealApprox(const std::string& decimal_value, int32_t scale, - float expected) { - Decimal dec(decimal_value); - auto actual = dec.template ToReal(scale); - ASSERT_FLOAT_EQ(actual, expected) - << "Decimal value: " << decimal_value << ", scale: " << scale - << ", expected: " << expected << ", actual: " << actual; -} - -void CheckDecimalToRealApprox(const std::string& decimal_value, int32_t scale, - double expected) { - Decimal dec(decimal_value); - auto actual = dec.template ToReal(scale); - ASSERT_DOUBLE_EQ(actual, expected) - << "Decimal value: " << decimal_value << ", scale: " << scale - << ", expected: " << expected << ", actual: " << actual; -} - -template -class TestDecimalToReal : public ::testing::Test { - public: - using ParamType = ToRealTestParam; - - Real Pow2(int exp) { return std::pow(static_cast(2), static_cast(exp)); } - - Real Pow10(int exp) { return std::pow(static_cast(10), static_cast(exp)); } - - void TestSuccess() { - const std::vector params{ - // clang-format off - {"0", 0, 0.0f}, - {"0", 10, 0.0f}, - {"0", -10, 0.0f}, - {"1", 0, 1.0f}, - {"12345", 0, 12345.f}, -#ifndef __MINGW32__ // MinGW has precision issues - {"12345", 1, 1234.5f}, -#endif - {"12345", -3, 12345000.f}, - // 2**62 - {"4611686018427387904", 0, Pow2(62)}, - // 2**63 + 2**62 - {"13835058055282163712", 0, Pow2(63) + Pow2(62)}, - // 2**64 + 2**62 - {"23058430092136939520", 0, Pow2(64) + Pow2(62)}, - // 10**38 - 2**103 -#ifndef __MINGW32__ // MinGW has precision issues - {"99999989858795198174164788026374356992", 0, Pow10(38) - Pow2(103)}, -#endif - // clang-format on - }; - - for (const ParamType& param : params) { - CheckDecimalToReal(param.decimal_value, param.scale, param.expected); - if (param.decimal_value != "0") { - // Check negative values as well - CheckDecimalToReal("-" + param.decimal_value, param.scale, -param.expected); - } - } - } -}; - -TYPED_TEST_SUITE(TestDecimalToReal, RealTypes); - -TYPED_TEST(TestDecimalToReal, TestSuccess) { this->TestSuccess(); } - -// Custom test for Decimal::ToReal -TEST(TestDecimalToReal, ToFloatLargeValues) { - constexpr auto max_scale = Decimal::kMaxScale; - - // Note that exact comparisons would succeed on some platforms (Linux, macOS). - // Nevertheless, power-of-ten factors are not all exactly representable - // in binary floating point. - for (int32_t scale = -max_scale; scale <= max_scale; scale++) { -#ifdef _WIN32 - // MSVC gives pow(10.f, -45.f) == 0 even though 1e-45f is nonzero - if (scale == 45) continue; -#endif - CheckDecimalToRealApprox( - "1", scale, std::pow(static_cast(10), static_cast(-scale))); - } - for (int32_t scale = -max_scale; scale <= max_scale - 2; scale++) { -#ifdef _WIN32 - // MSVC gives pow(10.f, -45.f) == 0 even though 1e-45f is nonzero - if (scale == 45) continue; -#endif - const auto factor = static_cast(123); - CheckDecimalToRealApprox( - "123", scale, - factor * std::pow(static_cast(10), static_cast(-scale))); - } -} - -TEST(TestDecimalToReal, ToFloatPrecision) { - // 2**63 + 2**40 (exactly representable in a float's 24 bits of precision) - CheckDecimalToReal("9223373136366403584", 0, 9.223373e+18f); - CheckDecimalToReal("-9223373136366403584", 0, -9.223373e+18f); - - // 2**64 + 2**41 (exactly representable in a float) - CheckDecimalToReal("18446746272732807168", 0, 1.8446746e+19f); - CheckDecimalToReal("-18446746272732807168", 0, -1.8446746e+19f); - - // Integers are always exact - auto scale = Decimal::kMaxScale - 1; - std::string seven = "7."; - seven.append(scale, '0'); // pad with trailing zeros - CheckDecimalToReal(seven, scale, 7.0f); - CheckDecimalToReal("-" + seven, scale, -7.0f); - - CheckDecimalToReal("99999999999999999999.0000000000000000", 16, - 99999999999999999999.0f); - CheckDecimalToReal("-99999999999999999999.0000000000000000", 16, - -99999999999999999999.0f); - - // Small fractions are within one ULP - CheckDecimalToRealWithinOneULP("9999999.9", 1, 9999999.9f); - CheckDecimalToRealWithinOneULP("-9999999.9", 1, -9999999.9f); - - CheckDecimalToRealWithinOneULP("9999999.999999", 6, 9999999.999999f); - CheckDecimalToRealWithinOneULP("-9999999.999999", 6, -9999999.999999f); - - // Large fractions are within 2^-23 - constexpr float epsilon = 1.1920928955078125e-07f; // 2^-23 - CheckDecimalToRealWithinEpsilon("112334829348925.99070703983306884765625", 23, - epsilon, - 112334829348925.99070703983306884765625f); - CheckDecimalToRealWithinEpsilon("1.987748987892758765582589910934859345", 36, - epsilon, - 1.987748987892758765582589910934859345f); -} - -// ToReal tests are disabled on MinGW because of precision issues in results -#ifndef __MINGW32__ - -TEST(TestDecimalToReal, ToDoubleLargeValues) { - // Note that exact comparisons would succeed on some platforms (Linux, macOS). - // Nevertheless, power-of-ten factors are not all exactly representable - // in binary floating point. - constexpr auto max_scale = Decimal::kMaxScale; - - for (int32_t scale = -max_scale; scale <= max_scale; scale++) { - CheckDecimalToRealApprox( - "1", scale, std::pow(static_cast(10), static_cast(-scale))); - } - for (int32_t scale = -max_scale + 1; scale <= max_scale - 1; scale++) { - const auto factor = static_cast(123); - CheckDecimalToRealApprox( - "123", scale, - factor * std::pow(static_cast(10), static_cast(-scale))); - } -} - -TEST(TestDecimalToReal, ToDoublePrecision) { - // 2**63 + 2**11 (exactly representable in a double's 53 bits of precision) - CheckDecimalToReal("9223372036854777856", 0, 9.223372036854778e+18); - CheckDecimalToReal("-9223372036854777856", 0, -9.223372036854778e+18); - // 2**64 - 2**11 (exactly representable in a double) - CheckDecimalToReal("18446744073709549568", 0, 1.844674407370955e+19); - CheckDecimalToReal("-18446744073709549568", 0, -1.844674407370955e+19); - // 2**64 + 2**11 (exactly representable in a double) - CheckDecimalToReal("18446744073709555712", 0, 1.8446744073709556e+19); - CheckDecimalToReal("-18446744073709555712", 0, -1.8446744073709556e+19); - - // Almost 10**38 (minus 2**73) - CheckDecimalToReal("99999999999999978859343891977453174784", 0, - 9.999999999999998e+37); - CheckDecimalToReal("-99999999999999978859343891977453174784", 0, - -9.999999999999998e+37); - CheckDecimalToReal("99999999999999978859343891977453174784", 10, - 9.999999999999998e+27); - CheckDecimalToReal("-99999999999999978859343891977453174784", 10, - -9.999999999999998e+27); - CheckDecimalToReal("99999999999999978859343891977453174784", -10, - 9.999999999999998e+47); - CheckDecimalToReal("-99999999999999978859343891977453174784", -10, - -9.999999999999998e+47); - - // Integers are always exact - auto scale = Decimal::kMaxScale - 1; - std::string seven = "7."; - seven.append(scale, '0'); - CheckDecimalToReal(seven, scale, 7.0); - CheckDecimalToReal("-" + seven, scale, -7.0); - - CheckDecimalToReal("99999999999999999999.0000000000000000", 16, - 99999999999999999999.0); - CheckDecimalToReal("-99999999999999999999.0000000000000000", 16, - -99999999999999999999.0); - - // Small fractions are within one ULP - CheckDecimalToRealWithinOneULP("9999999.9", 1, 9999999.9); - CheckDecimalToRealWithinOneULP("-9999999.9", 1, -9999999.9); - - CheckDecimalToRealWithinOneULP("9999999.999999999999999", 15, - 9999999.999999999999999); - CheckDecimalToRealWithinOneULP("-9999999.999999999999999", 15, - -9999999.999999999999999); - // Large fractions are within 2^-52 - constexpr double epsilon = 2.220446049250313080847263336181640625e-16; // 2^-52 - CheckDecimalToRealWithinEpsilon("112334829348925.99070703983306884765625", 23, - epsilon, - 112334829348925.99070703983306884765625); - CheckDecimalToRealWithinEpsilon("1.987748987892758765582589910934859345", 36, - epsilon, - 1.987748987892758765582589910934859345); -} - -#endif // __MINGW32__ - TEST(DecimalTest, FromBigEndian) { // We test out a variety of scenarios: // @@ -1132,42 +547,6 @@ TEST(DecimalTestFunctionality, Sign) { ASSERT_EQ(1, Decimal(0).Sign()); } -TEST(DecimalTestFunctionality, GetWholeAndFraction) { - Decimal value("123456"); - - auto check = [value](int32_t scale, std::pair expected) { - int32_t out; - auto result = value.GetWholeAndFraction(scale); - ASSERT_THAT(result->first.ToInteger(&out), IsOk()); - ASSERT_EQ(expected.first, out); - ASSERT_THAT(result->second.ToInteger(&out), IsOk()); - ASSERT_EQ(expected.second, out); - }; - - check(0, {123456, 0}); - check(1, {12345, 6}); - check(5, {1, 23456}); - check(7, {0, 123456}); -} - -TEST(DecimalTestFunctionality, GetWholeAndFractionNegative) { - Decimal value("-123456"); - - auto check = [value](int32_t scale, std::pair expected) { - int32_t out; - auto result = value.GetWholeAndFraction(scale); - ASSERT_THAT(result->first.ToInteger(&out), IsOk()); - ASSERT_EQ(expected.first, out); - ASSERT_THAT(result->second.ToInteger(&out), IsOk()); - ASSERT_EQ(expected.second, out); - }; - - check(0, {-123456, 0}); - check(1, {-12345, -6}); - check(5, {-1, -23456}); - check(7, {0, -123456}); -} - TEST(DecimalTestFunctionality, FitsInPrecision) { ASSERT_TRUE(Decimal("0").FitsInPrecision(1)); ASSERT_TRUE(Decimal("9").FitsInPrecision(1)); From d1fe1863722baa49279a1086d70569b2694ded92 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 3 Sep 2025 23:41:35 +0800 Subject: [PATCH 18/20] chore: use uint128 for FromBigEndian --- src/iceberg/util/decimal.cc | 126 ++++++------------------------------ 1 file changed, 18 insertions(+), 108 deletions(-) diff --git a/src/iceberg/util/decimal.cc b/src/iceberg/util/decimal.cc index cb10d3f37..ba1ca6c32 100644 --- a/src/iceberg/util/decimal.cc +++ b/src/iceberg/util/decimal.cc @@ -48,14 +48,6 @@ namespace iceberg { namespace { -// Signed left shift with well-defined behaviour on negative numbers or overflow -template - requires std::is_signed_v && std::is_integral_v -constexpr SignedInt SafeLeftShift(SignedInt u, Shift bits) { - using UnsignedInt = std::make_unsigned_t; - return static_cast(static_cast(u) << bits); -} - struct DecimalComponents { std::string_view while_digits; std::string_view fractional_digits; @@ -205,7 +197,9 @@ inline void ShiftAndAdd(std::string_view input, uint128_t& out) { const uint64_t multiple = kUInt64PowersOfTen[group_size]; uint64_t value = 0; - std::from_chars(input.data() + pos, input.data() + pos + group_size, value); + auto [_, ec] = + std::from_chars(input.data() + pos, input.data() + pos + group_size, value); + ICEBERG_DCHECK(ec == std::errc(), "Failed to parse digits in ShiftAndAdd"); out = out * multiple + value; pos += group_size; @@ -471,73 +465,6 @@ Result Decimal::FromString(std::string_view str, int32_t* precision, namespace { -constexpr float kFloatInf = std::numeric_limits::infinity(); - -// Attention: these pre-computed constants might not exactly represent their -// decimal counterparts: -// >>> int32_t(1e38) -// 99999999999999997748809823456034029568 - -constexpr int32_t kPrecomputedPowersOfTen = 76; - -constexpr std::array kFloatPowersOfTen = { - 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1e-45f, 1e-44f, 1e-43f, 1e-42f, - 1e-41f, 1e-40f, 1e-39f, 1e-38f, 1e-37f, 1e-36f, 1e-35f, - 1e-34f, 1e-33f, 1e-32f, 1e-31f, 1e-30f, 1e-29f, 1e-28f, - 1e-27f, 1e-26f, 1e-25f, 1e-24f, 1e-23f, 1e-22f, 1e-21f, - 1e-20f, 1e-19f, 1e-18f, 1e-17f, 1e-16f, 1e-15f, 1e-14f, - 1e-13f, 1e-12f, 1e-11f, 1e-10f, 1e-9f, 1e-8f, 1e-7f, - 1e-6f, 1e-5f, 1e-4f, 1e-3f, 1e-2f, 1e-1f, 1e0f, - 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f, - 1e8f, 1e9f, 1e10f, 1e11f, 1e12f, 1e13f, 1e14f, - 1e15f, 1e16f, 1e17f, 1e18f, 1e19f, 1e20f, 1e21f, - 1e22f, 1e23f, 1e24f, 1e25f, 1e26f, 1e27f, 1e28f, - 1e29f, 1e30f, 1e31f, 1e32f, 1e33f, 1e34f, 1e35f, - 1e36f, 1e37f, 1e38f, kFloatInf, kFloatInf, kFloatInf, kFloatInf, - kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, - kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, - kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, - kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, - kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf}; - -constexpr std::array kDoublePowersOfTen = { - 1e-76, 1e-75, 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66, 1e-65, - 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57, 1e-56, 1e-55, 1e-54, 1e-53, - 1e-52, 1e-51, 1e-50, 1e-49, 1e-48, 1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41, - 1e-40, 1e-39, 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29, - 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17, - 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, - 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, - 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, - 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31, - 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42, 1e43, - 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, - 1e56, 1e57, 1e58, 1e59, 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, - 1e68, 1e69, 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76}; - -// Helper function used by Decimal::FromBigEndian -static inline uint64_t UInt64FromBigEndian(const uint8_t* bytes, int32_t length) { - // We don't bounds check the length here because this is called by - // FromBigEndian that has a Decimal128 as its out parameters and - // that function is already checking the length of the bytes and only - // passes lengths between zero and eight. - uint64_t result = 0; - // Using memcpy instead of special casing for length - // and doing the conversion in 16, 32 parts, which could - // possibly create unaligned memory access on certain platforms - std::memcpy(reinterpret_cast(&result) + 8 - length, bytes, length); - - if constexpr (std::endian::native == std::endian::little) { - return std::byteswap(result); - } else { - return result; - } -} - static bool RescaleWouldCauseDataLoss(const Decimal& value, int32_t delta_scale, const Decimal& multiplier, Decimal* result) { if (delta_scale < 0) { @@ -569,41 +496,24 @@ Result Decimal::FromBigEndian(const uint8_t* bytes, int32_t length) { // sign bit. const bool is_negative = static_cast(bytes[0]) < 0; - // 1. Extract the high bytes - // Stop byte of the high bytes - const int32_t high_bits_offset = std::max(0, length - 8); - const auto high_bits = UInt64FromBigEndian(bytes, high_bits_offset); + uint128_t result = 0; + std::memcpy(reinterpret_cast(&result) + 16 - length, bytes, length); - if (high_bits_offset == 8) { - // Avoid undefined shift by 64 below - high = high_bits; - } else { - high = -1 * (is_negative && length < kMaxDecimalBytes); - // Shift left enough bits to make room for the incoming int64_t - high = SafeLeftShift(high, high_bits_offset * CHAR_BIT); - // Preserve the upper bits by inplace OR-ing the int64_t - high |= high_bits; - } - - // 2. Extract the low bytes - // Stop byte of the low bytes - const int32_t low_bits_offset = std::min(length, 8); - const auto low_bits = - UInt64FromBigEndian(bytes + high_bits_offset, length - high_bits_offset); - - if (low_bits_offset == 8) { - // Avoid undefined shift by 64 below - low = low_bits; - } else { - // Sign extend the low bits if necessary - low = -1 * (is_negative && length < 8); - // Shift left enough bits to make room for the incoming int64_t - low = SafeLeftShift(low, low_bits_offset * CHAR_BIT); - // Preserve the upper bits by inplace OR-ing the int64_t - low |= low_bits; + if constexpr (std::endian::native == std::endian::little) { + auto high = static_cast(result >> 64); + auto low = static_cast(result); + high = std::byteswap(high); + low = std::byteswap(low); + // also need to swap the two halves + result = (static_cast(low) << 64) | high; + } + + if (is_negative && length < kMaxDecimalBytes) { + // Sign extend the high bits + result |= (static_cast(-1) << (length * CHAR_BIT)); } - return Decimal(high, static_cast(low)); + return Decimal(static_cast(result)); } Result Decimal::Rescale(int32_t orig_scale, int32_t new_scale) const { From cb22a3c965ec501997bcdd794850a67a2f777a85 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Thu, 4 Sep 2025 07:31:16 +0800 Subject: [PATCH 19/20] chore: fix minor issue --- src/iceberg/inheritable_metadata.cc | 1 - src/iceberg/util/decimal.cc | 31 ++++++++++++----------------- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/src/iceberg/inheritable_metadata.cc b/src/iceberg/inheritable_metadata.cc index ae0920873..58eb28345 100644 --- a/src/iceberg/inheritable_metadata.cc +++ b/src/iceberg/inheritable_metadata.cc @@ -19,7 +19,6 @@ #include "iceberg/inheritable_metadata.h" -#include #include #include diff --git a/src/iceberg/util/decimal.cc b/src/iceberg/util/decimal.cc index ba1ca6c32..33e7ecdff 100644 --- a/src/iceberg/util/decimal.cc +++ b/src/iceberg/util/decimal.cc @@ -26,7 +26,6 @@ #include #include -#include #include #include #include @@ -270,6 +269,19 @@ void AdjustIntegerStringWithScale(int32_t scale, std::string* str) { str->at(is_negative_offset + 1) = '.'; } +bool RescaleWouldCauseDataLoss(const Decimal& value, int32_t delta_scale, + const Decimal& multiplier, Decimal* result) { + if (delta_scale < 0) { + auto res = value.Divide(multiplier); + ICEBERG_DCHECK(res, "Decimal::Divide failed"); + *result = res->first; + return res->second != 0; + } + + *result = value * multiplier; + return (value < 0) ? *result > value : *result < value; +} + } // namespace Decimal::Decimal(std::string_view str) { @@ -463,23 +475,6 @@ Result Decimal::FromString(std::string_view str, int32_t* precision, return result; } -namespace { - -static bool RescaleWouldCauseDataLoss(const Decimal& value, int32_t delta_scale, - const Decimal& multiplier, Decimal* result) { - if (delta_scale < 0) { - auto res = value.Divide(multiplier); - ICEBERG_DCHECK(res, "Decimal::Divide failed"); - *result = res->first; - return res->second != 0; - } - - *result = value * multiplier; - return (value < 0) ? *result > value : *result < value; -} - -} // namespace - Result Decimal::FromBigEndian(const uint8_t* bytes, int32_t length) { static constexpr int32_t kMinDecimalBytes = 1; static constexpr int32_t kMaxDecimalBytes = 16; From 081ef1f059f551a9a08c68bf282a64b190823b67 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sun, 7 Sep 2025 19:32:32 +0800 Subject: [PATCH 20/20] fix: review comments --- src/iceberg/util/decimal.cc | 10 +++------- src/iceberg/util/decimal.h | 10 +++++----- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/iceberg/util/decimal.cc b/src/iceberg/util/decimal.cc index 33e7ecdff..606bf2fdb 100644 --- a/src/iceberg/util/decimal.cc +++ b/src/iceberg/util/decimal.cc @@ -24,21 +24,18 @@ #include "iceberg/util/decimal.h" -#include #include #include #include #include -#include #include #include #include #include #include -#include -#include #include +#include "iceberg/exception.h" #include "iceberg/result.h" #include "iceberg/util/int128.h" #include "iceberg/util/macros.h" @@ -287,9 +284,8 @@ bool RescaleWouldCauseDataLoss(const Decimal& value, int32_t delta_scale, Decimal::Decimal(std::string_view str) { auto result = Decimal::FromString(str); if (!result) { - throw std::runtime_error( - std::format("Failed to parse Decimal from string: {}, error: {}", str, - result.error().message)); + throw IcebergError(std::format("Failed to parse Decimal from string: {}, error: {}", + str, result.error().message)); } *this = std::move(result.value()); } diff --git a/src/iceberg/util/decimal.h b/src/iceberg/util/decimal.h index 695bbbbf1..f118bde62 100644 --- a/src/iceberg/util/decimal.h +++ b/src/iceberg/util/decimal.h @@ -52,18 +52,18 @@ class ICEBERG_EXPORT Decimal : public util::Formattable { constexpr Decimal() noexcept = default; /// \brief Create a Decimal from a 128-bit integer. - constexpr Decimal(int128_t value) noexcept // NOLINT(google-explicit-constructor) + constexpr Decimal(int128_t value) noexcept // NOLINT implicit conversion : data_(value) {} /// \brief Create a Decimal from any integer not wider than 64 bits. template requires(std::is_integral_v && (sizeof(T) <= sizeof(uint64_t))) - constexpr Decimal(T value) noexcept // NOLINT(google-explicit-constructor) - { - data_ = static_cast(value); - } + constexpr Decimal(T value) noexcept // NOLINT implicit conversion + : data_(static_cast(value)) {} /// \brief Parse a Decimal from a string representation. + /// \throw This constructor throws an exception if parsing fails. Use + /// Decimal::FromString() if you want to handle errors more gracefully. explicit Decimal(std::string_view str); /// \brief Create a Decimal from two 64-bit integers.