Skip to content

Commit 9b4b7fa

Browse files
committed
feat: Literal support decimal & respect Appendix B: 32-bit Hash Requirements
1 parent 81bf29e commit 9b4b7fa

20 files changed

+1086
-243
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,14 @@ set(ICEBERG_SOURCES
5555
manifest_reader_internal.cc
5656
manifest_writer.cc
5757
arrow_c_data_guard_internal.cc
58+
util/bucket_util.cc
5859
util/conversions.cc
5960
util/decimal.cc
6061
util/gzip_internal.cc
6162
util/murmurhash3_internal.cc
63+
util/temporal_util.cc
6264
util/timepoint.cc
65+
util/truncate_util.cc
6366
util/uuid.cc)
6467

6568
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)

src/iceberg/expression/literal.cc

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,18 @@ Literal Literal::Fixed(std::vector<uint8_t> value) {
158158
return {Value{std::move(value)}, fixed(length)};
159159
}
160160

161+
/// \brief Build a decimal literal from an already-constructed Decimal value.
///
/// \param value The decimal value to wrap.
/// \param precision Precision used to build the literal's decimal type.
/// \param scale Scale used to build the literal's decimal type.
/// \return A literal holding `value` with type `decimal(precision, scale)`.
Literal Literal::MakeDecimal(Decimal value, int32_t precision, int32_t scale) {
  auto literal_type = decimal(precision, scale);
  return Literal{Value{value}, std::move(literal_type)};
}
164+
165+
/// \brief Build a decimal literal by parsing its textual representation.
///
/// The precision and scale of the resulting literal type are the ones reported
/// by Decimal::FromString for the parsed text.
///
/// \param value Text to parse.
/// \return The parsed literal, or the error raised by Decimal::FromString via
/// ICEBERG_ASSIGN_OR_RAISE.
Result<Literal> Literal::MakeDecimal(std::string_view value) {
  int32_t parsed_precision = 0;
  int32_t parsed_scale = 0;
  ICEBERG_ASSIGN_OR_RAISE(auto parsed,
                          Decimal::FromString(value, &parsed_precision, &parsed_scale));
  auto parsed_type = decimal(parsed_precision, parsed_scale);
  return Literal{Value{parsed}, std::move(parsed_type)};
}
172+
161173
Result<Literal> Literal::Deserialize(std::span<const uint8_t> data,
162174
std::shared_ptr<PrimitiveType> type) {
163175
return Conversions::FromBytes(std::move(type), data);
@@ -193,12 +205,44 @@ std::strong_ordering CompareFloat(T lhs, T rhs) {
193205
return lhs_is_negative <=> rhs_is_negative;
194206
}
195207

208+
/// \brief Three-way compare two decimal literals, aligning their scales first.
///
/// Both literals are expected to hold a Decimal value and to carry a DecimalType;
/// this is verified with ICEBERG_DCHECK only. When the scales are equal the
/// unscaled values are compared directly. Otherwise the operand with the smaller
/// scale is rescaled up to the larger scale and the comparison is done there.
///
/// \param lhs Left-hand decimal literal.
/// \param rhs Right-hand decimal literal.
/// \return The ordering of `lhs` relative to `rhs`.
std::strong_ordering CompareDecimal(Literal const& lhs, Literal const& rhs) {
  ICEBERG_DCHECK(std::holds_alternative<Decimal>(lhs.value()),
                 "LHS of decimal comparison must hold Decimal");
  ICEBERG_DCHECK(std::holds_alternative<Decimal>(rhs.value()),
                 "RHS of decimal comparison must hold decimal");
  const auto lhs_type = std::dynamic_pointer_cast<DecimalType>(lhs.type());
  const auto rhs_type = std::dynamic_pointer_cast<DecimalType>(rhs.type());
  ICEBERG_DCHECK(lhs_type != nullptr, "LHS type must be DecimalType");
  ICEBERG_DCHECK(rhs_type != nullptr, "RHS type must be DecimalType");

  auto lhs_decimal = std::get<Decimal>(lhs.value());
  auto rhs_decimal = std::get<Decimal>(rhs.value());
  const auto lhs_scale = lhs_type->scale();
  const auto rhs_scale = rhs_type->scale();

  if (lhs_scale == rhs_scale) {
    return lhs_decimal <=> rhs_decimal;
  }

  // Used to determine the sign of an operand whose rescale failed.
  const Decimal zero(0);

  if (lhs_scale > rhs_scale) {
    // Bring rhs up to the (larger) scale of lhs.
    auto rhs_res = rhs_decimal.Rescale(rhs_scale, lhs_scale);
    if (!rhs_res) {
      // NOTE(review): assumes Rescale to a larger scale fails only because the
      // scaled value no longer fits -- confirm against Decimal::Rescale.
      // In that case rhs has a larger magnitude than any representable lhs, so
      // the ordering is decided by the sign of rhs: a negative rhs is below lhs,
      // a positive rhs is above it.
      return (rhs_decimal <=> zero) < 0 ? std::strong_ordering::greater
                                        : std::strong_ordering::less;
    }
    return lhs_decimal <=> rhs_res.value();
  }

  // Bring lhs up to the (larger) scale of rhs.
  auto lhs_res = lhs_decimal.Rescale(lhs_scale, rhs_scale);
  if (!lhs_res) {
    // Same reasoning as above, mirrored: the sign of lhs decides the ordering.
    return (lhs_decimal <=> zero) < 0 ? std::strong_ordering::less
                                      : std::strong_ordering::greater;
  }
  return lhs_res.value() <=> rhs_decimal;
}
239+
196240
/// \brief Equality is defined through the three-way comparison: two literals are
/// equal exactly when `operator<=>` reports equivalence.
bool Literal::operator==(const Literal& other) const {
  const auto ordering = *this <=> other;
  return std::is_eq(ordering);
}
197241

198242
// Three-way comparison operator
199243
std::partial_ordering Literal::operator<=>(const Literal& other) const {
200244
// If types are different, comparison is unordered
201-
if (*type_ != *other.type_) {
245+
if (type_->type_id() != other.type_->type_id()) {
202246
return std::partial_ordering::unordered;
203247
}
204248

@@ -247,6 +291,10 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const {
247291
return CompareFloat(this_val, other_val);
248292
}
249293

294+
case TypeId::kDecimal: {
295+
return CompareDecimal(*this, other);
296+
}
297+
250298
case TypeId::kString: {
251299
auto& this_val = std::get<std::string>(value_);
252300
auto& other_val = std::get<std::string>(other.value_);
@@ -307,6 +355,14 @@ std::string Literal::ToString() const {
307355
case TypeId::kDouble: {
308356
return std::to_string(std::get<double>(value_));
309357
}
358+
case TypeId::kDecimal: {
359+
auto decimal = std::get<Decimal>(value_);
360+
auto decimal_type = std::dynamic_pointer_cast<DecimalType>(type_);
361+
ICEBERG_DCHECK(decimal_type != nullptr, "Type must be DecimalType");
362+
auto result = decimal.ToString(decimal_type->scale());
363+
ICEBERG_CHECK(result, "Decimal ToString failed");
364+
return *result;
365+
}
310366
case TypeId::kString: {
311367
return std::get<std::string>(value_);
312368
}
@@ -331,7 +387,6 @@ std::string Literal::ToString() const {
331387
}
332388
return result;
333389
}
334-
case TypeId::kDecimal:
335390
case TypeId::kDate:
336391
case TypeId::kTime:
337392
case TypeId::kTimestamp:

src/iceberg/expression/literal.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@
2222
#include <compare>
2323
#include <memory>
2424
#include <string>
25+
#include <string_view>
2526
#include <variant>
2627
#include <vector>
2728

2829
#include "iceberg/result.h"
2930
#include "iceberg/type.h"
31+
#include "iceberg/util/decimal.h"
3032
#include "iceberg/util/formattable.h"
3133
#include "iceberg/util/uuid.h"
3234

@@ -56,10 +58,10 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
5658
int64_t, // for long, timestamp, timestamp_tz, time
5759
float, // for float
5860
double, // for double
61+
Decimal, // for decimal
5962
std::string, // for string
6063
Uuid, // for uuid
61-
std::vector<uint8_t>, // for binary, fixed
62-
std::array<uint8_t, 16>, // for decimal
64+
std::vector<uint8_t>, // for binary, fixed
6365
BelowMin, AboveMax>;
6466

6567
/// \brief Factory methods for primitive types
@@ -76,6 +78,8 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
7678
static Literal UUID(Uuid value);
7779
static Literal Binary(std::vector<uint8_t> value);
7880
static Literal Fixed(std::vector<uint8_t> value);
81+
static Literal MakeDecimal(Decimal value, int32_t precision, int32_t scale);
82+
static Result<Literal> MakeDecimal(std::string_view value);
7983

8084
/// \brief Create a literal representing a null value.
8185
static Literal Null(std::shared_ptr<PrimitiveType> type) {
@@ -205,6 +209,11 @@ struct LiteralTraits<TypeId::kDouble> {
205209
using ValueType = double;
206210
};
207211

212+
/// \brief LiteralTraits specialization for TypeId::kDecimal: the value type
/// associated with decimal literals is Decimal.
template <>
213+
struct LiteralTraits<TypeId::kDecimal> {
214+
using ValueType = Decimal;
215+
};
216+
208217
template <>
209218
struct LiteralTraits<TypeId::kString> {
210219
using ValueType = std::string;

src/iceberg/test/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,13 @@ add_iceberg_test(json_serde_test
9999

100100
add_iceberg_test(util_test
101101
SOURCES
102+
bucket_util_test.cc
102103
config_test.cc
103104
decimal_test.cc
104105
endian_test.cc
105106
formatter_test.cc
106107
string_util_test.cc
108+
truncate_util_test.cc
107109
uuid_test.cc
108110
visit_type_test.cc)
109111

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/util/bucket_util.h"
21+
22+
#include <chrono>
23+
24+
#include <gtest/gtest.h>
25+
26+
namespace iceberg {
27+
28+
// The following tests are from
29+
// https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements
30+
// Checks the BucketUtils hash helpers against the reference values listed in
// Appendix B (32-bit hash requirements) of the Iceberg spec. Intermediate values
// are not printed: the expectations below already report them on failure.
TEST(BucketUtilsTest, HashHelper) {
  // int and long: the same numeric value must hash identically.
  EXPECT_EQ(BucketUtils::HashInt(34), 2017239379);
  EXPECT_EQ(BucketUtils::HashLong(34L), 2017239379);

  // decimal: hashed over the big-endian bytes of the unscaled value of 14.20
  auto decimal = Decimal::FromString("14.20");
  ASSERT_TRUE(decimal.has_value());
  EXPECT_EQ(BucketUtils::HashBytes(Decimal::ToBigEndian(decimal->value())), -500754589);

  // date: 2017-11-16 as days since 1970-01-01
  std::chrono::sys_days sd = std::chrono::year{2017} / 11 / 16;
  std::chrono::sys_days epoch{std::chrono::year{1970} / 1 / 1};
  int32_t days = (sd - epoch).count();
  EXPECT_EQ(BucketUtils::HashInt(days), -653330422);

  // time: 22:31:08 as microseconds since midnight
  int64_t time_micros = (22 * 3600 + 31 * 60 + 8) * 1000000LL;
  EXPECT_EQ(BucketUtils::HashLong(time_micros), -662762989);

  // timestamp: 2017-11-16T22:31:08 as microseconds since the epoch
  std::chrono::system_clock::time_point tp =
      std::chrono::sys_days{std::chrono::year{2017} / 11 / 16} + std::chrono::hours{22} +
      std::chrono::minutes{31} + std::chrono::seconds{8};
  int64_t timestamp_micros =
      std::chrono::duration_cast<std::chrono::microseconds>(tp.time_since_epoch())
          .count();
  EXPECT_EQ(BucketUtils::HashLong(timestamp_micros), -2047944441);
  // 2017-11-16T22:31:08.000001
  EXPECT_EQ(BucketUtils::HashLong(timestamp_micros + 1), -1207196810);

  // string: hashed over its raw bytes
  std::string str = "iceberg";
  EXPECT_EQ(BucketUtils::HashBytes(std::span<const uint8_t>(
                reinterpret_cast<const uint8_t*>(str.data()), str.size())),
            1210000089);

  // uuid: f79c3e09-677c-4bbd-a479-3f349cb785e7
  std::array<uint8_t, 16> uuid = {0xf7, 0x9c, 0x3e, 0x09, 0x67, 0x7c, 0x4b, 0xbd,
                                  0xa4, 0x79, 0x3f, 0x34, 0x9c, 0xb7, 0x85, 0xe7};
  EXPECT_EQ(BucketUtils::HashBytes(uuid), 1488055340);

  // fixed & binary
  std::vector<uint8_t> fixed = {0, 1, 2, 3};
  EXPECT_EQ(BucketUtils::HashBytes(fixed), -188683207);
}
83+
84+
} // namespace iceberg

src/iceberg/test/decimal_test.cc

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,50 @@ TEST(DecimalTest, FromBigEndianInvalid) {
490490
IsError(ErrorKind::kInvalidArgument));
491491
}
492492

493+
// Round-trips a spread of 128-bit values through Decimal::ToBigEndian and
// Decimal::FromBigEndian and expects to get the starting value back each time.
TEST(DecimalTest, ToBigEndian) {
  // High words: zero, +/-1, and values on and just past the 32-bit and 64-bit
  // signed boundaries.
  std::vector<int64_t> high_words = {0,
                                     1,
                                     -1,
                                     INT32_MAX,
                                     INT32_MIN,
                                     static_cast<int64_t>(INT32_MAX) + 1,
                                     static_cast<int64_t>(INT32_MIN) - 1,
                                     INT64_MAX,
                                     INT64_MIN};

  // Low words: a few small values, UINT32_MAX, the eight values immediately
  // after it, and finally the largest 64-bit value.
  std::vector<uint64_t> low_words = {0, 1, 255, UINT32_MAX};
  for (uint64_t offset = 1; offset <= 8; ++offset) {
    low_words.push_back(static_cast<uint64_t>(UINT32_MAX) + offset);
  }
  low_words.push_back(UINT64_MAX);

  for (int64_t high_word : high_words) {
    for (uint64_t low_word : low_words) {
      Decimal expected(high_word, low_word);
      auto encoded = Decimal::ToBigEndian(expected.value());
      auto decoded = Decimal::FromBigEndian(encoded.data(), encoded.size());
      ASSERT_THAT(decoded, IsOk());
      EXPECT_EQ(decoded.value(), expected);
    }
  }

  // Values built directly from a single 128-bit integer, both signs.
  std::vector<int128_t> single_values = {-INT64_MAX, -INT32_MAX, -255,      -1,       0,
                                         1,          255,        256,       INT32_MAX,
                                         INT64_MAX};
  for (int128_t single_value : single_values) {
    Decimal expected(single_value);
    auto encoded = Decimal::ToBigEndian(expected.value());
    auto decoded = Decimal::FromBigEndian(encoded.data(), encoded.size());
    ASSERT_THAT(decoded, IsOk());
    EXPECT_EQ(decoded.value(), expected);
  }
}
536+
493537
TEST(DecimalTestFunctionality, Multiply) {
494538
ASSERT_EQ(Decimal(60501), Decimal(301) * Decimal(201));
495539
ASSERT_EQ(Decimal(-60501), Decimal(-301) * Decimal(201));

0 commit comments

Comments
 (0)