Skip to content

Commit 5e9bb1b

Browse files
committed
feat: Literal support decimal & respect Appendix B: 32-bit Hash Requirements
1 parent 064d53b commit 5e9bb1b

23 files changed

+1042
-237
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,14 @@ set(ICEBERG_SOURCES
5555
manifest_reader_internal.cc
5656
manifest_writer.cc
5757
arrow_c_data_guard_internal.cc
58+
util/bucket_util.cc
5859
util/conversions.cc
5960
util/decimal.cc
6061
util/gzip_internal.cc
6162
util/murmurhash3_internal.cc
63+
util/temporal_util.cc
6264
util/timepoint.cc
65+
util/truncate_util.cc
6366
util/uuid.cc)
6467

6568
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)

src/iceberg/expression/literal.cc

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@
2424
#include <cstdint>
2525
#include <string>
2626

27-
#include "iceberg/type_fwd.h"
27+
#include "iceberg/exception.h"
2828
#include "iceberg/util/checked_cast.h"
2929
#include "iceberg/util/conversions.h"
30+
#include "iceberg/util/decimal.h"
31+
#include "iceberg/util/macros.h"
3032

3133
namespace iceberg {
3234

@@ -296,6 +298,10 @@ Literal Literal::Fixed(std::vector<uint8_t> value) {
296298
return {Value{std::move(value)}, fixed(size)};
297299
}
298300

301+
/// \brief Build a decimal literal from an unscaled 128-bit integer.
///
/// The stored value is `::iceberg::Decimal(value)` and the literal's type is
/// `decimal(precision, scale)`. No validation that `value` fits the requested
/// precision is performed here.
Literal Literal::Decimal(int128_t value, int32_t precision, int32_t scale) {
  Value unscaled{::iceberg::Decimal(value)};
  auto decimal_type = decimal(precision, scale);
  return {std::move(unscaled), std::move(decimal_type)};
}
304+
299305
Result<Literal> Literal::Deserialize(std::span<const uint8_t> data,
300306
std::shared_ptr<PrimitiveType> type) {
301307
return Conversions::FromBytes(std::move(type), data);
@@ -331,12 +337,42 @@ std::strong_ordering CompareFloat(T lhs, T rhs) {
331337
return lhs_is_negative <=> rhs_is_negative;
332338
}
333339

340+
/// \brief Three-way compare two decimal literals whose scales may differ.
///
/// When the scales match, the unscaled values are compared directly. Otherwise
/// the operand with the smaller scale is rescaled up to the larger scale before
/// comparing, so both unscaled values are expressed in the same unit.
///
/// If that up-scaling fails, the operand is assumed to be too large in
/// magnitude to be represented at the larger scale (NOTE(review): this relies
/// on Decimal::Rescale only failing on overflow when the scale increases —
/// confirm against decimal.h). In that case the ordering is decided by the
/// sign of the operand that overflowed: a hugely negative value is smaller
/// than the other operand, a hugely positive value is larger.
///
/// \param lhs Left operand; must hold a Decimal value and have a DecimalType.
/// \param rhs Right operand; must hold a Decimal value and have a DecimalType.
/// \return The ordering of `lhs` relative to `rhs`.
std::strong_ordering CompareDecimal(Literal const& lhs, Literal const& rhs) {
  ICEBERG_DCHECK(std::holds_alternative<Decimal>(lhs.value()),
                 "LHS of decimal comparison must hold Decimal");
  ICEBERG_DCHECK(std::holds_alternative<Decimal>(rhs.value()),
                 "RHS of decimal comparison must hold decimal");

  // Hold the casted pointers by value; the cast returns a temporary.
  const auto lhs_type = std::dynamic_pointer_cast<DecimalType>(lhs.type());
  const auto rhs_type = std::dynamic_pointer_cast<DecimalType>(rhs.type());
  const auto lhs_scale = lhs_type->scale();
  const auto rhs_scale = rhs_type->scale();

  auto lhs_decimal = std::get<Decimal>(lhs.value());
  auto rhs_decimal = std::get<Decimal>(rhs.value());

  if (lhs_scale == rhs_scale) {
    return lhs_decimal <=> rhs_decimal;
  }

  const Decimal zero(0);

  if (lhs_scale > rhs_scale) {
    // Bring rhs up to the (larger) scale of lhs.
    auto rhs_res = rhs_decimal.Rescale(rhs_scale, lhs_scale);
    if (!rhs_res) {
      // rhs is out of range at the larger scale: its sign decides the result.
      return rhs_decimal < zero ? std::strong_ordering::greater
                                : std::strong_ordering::less;
    }
    return lhs_decimal <=> rhs_res.value();
  }

  // Bring lhs up to the (larger) scale of rhs.
  auto lhs_res = lhs_decimal.Rescale(lhs_scale, rhs_scale);
  if (!lhs_res) {
    // lhs is out of range at the larger scale: its sign decides the result.
    return lhs_decimal < zero ? std::strong_ordering::less
                              : std::strong_ordering::greater;
  }
  return lhs_res.value() <=> rhs_decimal;
}
369+
334370
// Equality is derived from the three-way comparison; an unordered result is
// never considered equal.
bool Literal::operator==(const Literal& other) const {
  return std::is_eq(*this <=> other);
}
335371

336372
// Three-way comparison operator
337373
std::partial_ordering Literal::operator<=>(const Literal& other) const {
338374
// If types are different, comparison is unordered
339-
if (*type_ != *other.type_) {
375+
if (type_->type_id() != other.type_->type_id()) {
340376
return std::partial_ordering::unordered;
341377
}
342378

@@ -385,6 +421,10 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const {
385421
return CompareFloat(this_val, other_val);
386422
}
387423

424+
case TypeId::kDecimal: {
425+
return CompareDecimal(*this, other);
426+
}
427+
388428
case TypeId::kString: {
389429
auto& this_val = std::get<std::string>(value_);
390430
auto& other_val = std::get<std::string>(other.value_);
@@ -440,6 +480,13 @@ std::string Literal::ToString() const {
440480
case TypeId::kDouble: {
441481
return std::to_string(std::get<double>(value_));
442482
}
483+
case TypeId::kDecimal: {
484+
auto decimal_type = internal::checked_pointer_cast<DecimalType>(type_);
485+
auto decimal = std::get<::iceberg::Decimal>(value_);
486+
auto result = decimal.ToString(decimal_type->scale());
487+
ICEBERG_CHECK(result, "Decimal ToString failed");
488+
return *result;
489+
}
443490
case TypeId::kString: {
444491
return "\"" + std::get<std::string>(value_) + "\"";
445492
}

src/iceberg/expression/literal.h

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@
2727

2828
#include "iceberg/result.h"
2929
#include "iceberg/type.h"
30+
#include "iceberg/util/decimal.h"
3031
#include "iceberg/util/formattable.h"
32+
#include "iceberg/util/int128.h"
3133
#include "iceberg/util/uuid.h"
3234

3335
namespace iceberg {
@@ -56,10 +58,10 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
5658
int64_t, // for long, timestamp, timestamp_tz, time
5759
float, // for float
5860
double, // for double
59-
std::string, // for string
60-
Uuid, // for uuid
61-
std::vector<uint8_t>, // for binary, fixed
62-
std::array<uint8_t, 16>, // for decimal
61+
::iceberg::Decimal, // for decimal
62+
std::string, // for string
63+
Uuid, // for uuid
64+
std::vector<uint8_t>, // for binary, fixed
6365
BelowMin, AboveMax>;
6466

6567
/// \brief Factory methods for primitive types
@@ -77,6 +79,10 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
7779
static Literal Binary(std::vector<uint8_t> value);
7880
static Literal Fixed(std::vector<uint8_t> value);
7981

82+
/// \brief Create a decimal literal.
83+
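/// \param value The unscaled 128-bit integer value.
/// \param precision The precision of the resulting decimal type.
/// \param scale The scale of the resulting decimal type.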
84+
static Literal Decimal(int128_t value, int32_t precision, int32_t scale);
85+
8086
/// \brief Create a literal representing a null value.
8187
static Literal Null(std::shared_ptr<PrimitiveType> type) {
8288
return {Value{std::monostate{}}, std::move(type)};
@@ -205,6 +211,11 @@ struct LiteralTraits<TypeId::kDouble> {
205211
using ValueType = double;
206212
};
207213

214+
template <>
215+
struct LiteralTraits<TypeId::kDecimal> {
216+
using ValueType = Decimal;
217+
};
218+
208219
template <>
209220
struct LiteralTraits<TypeId::kString> {
210221
using ValueType = std::string;

src/iceberg/meson.build

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,14 @@ iceberg_sources = files(
7777
'transform.cc',
7878
'transform_function.cc',
7979
'type.cc',
80+
'util/bucket_util.cc',
8081
'util/conversions.cc',
8182
'util/decimal.cc',
8283
'util/gzip_internal.cc',
8384
'util/murmurhash3_internal.cc',
85+
'util/temporal_util.cc',
8486
'util/timepoint.cc',
87+
'util/truncate_util.cc',
8588
'util/uuid.cc',
8689
)
8790

src/iceberg/test/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,13 @@ add_iceberg_test(json_serde_test
9999

100100
add_iceberg_test(util_test
101101
SOURCES
102+
bucket_util_test.cc
102103
config_test.cc
103104
decimal_test.cc
104105
endian_test.cc
105106
formatter_test.cc
106107
string_util_test.cc
108+
truncate_util_test.cc
107109
uuid_test.cc
108110
visit_type_test.cc)
109111

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/util/bucket_util.h"
21+
22+
#include <chrono>
23+
24+
#include <gtest/gtest.h>
25+
26+
#include "iceberg/util/decimal.h"
27+
#include "iceberg/util/uuid.h"
28+
29+
namespace iceberg {
30+
31+
// The following tests are from
32+
// https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements
33+
TEST(BucketUtilsTest, HashHelper) {
  // int and long
  EXPECT_EQ(BucketUtils::HashInt(34), 2017239379);
  EXPECT_EQ(BucketUtils::HashLong(34L), 2017239379);

  // decimal hash
  auto decimal = Decimal::FromString("14.20");
  ASSERT_TRUE(decimal.has_value());
  EXPECT_EQ(BucketUtils::HashBytes(decimal->ToBigEndian()), -500754589);

  // date hash
  std::chrono::sys_days sd = std::chrono::year{2017} / 11 / 16;
  std::chrono::sys_days epoch{std::chrono::year{1970} / 1 / 1};
  int32_t days = (sd - epoch).count();
  EXPECT_EQ(BucketUtils::HashInt(days), -653330422);

  // time
  // 22:31:08 in microseconds
  int64_t time_micros = (22 * 3600 + 31 * 60 + 8) * 1000000LL;
  EXPECT_EQ(BucketUtils::HashLong(time_micros), -662762989);

  // timestamp
  // 2017-11-16T22:31:08 in microseconds
  std::chrono::system_clock::time_point tp =
      std::chrono::sys_days{std::chrono::year{2017} / 11 / 16} + std::chrono::hours{22} +
      std::chrono::minutes{31} + std::chrono::seconds{8};
  int64_t timestamp_micros =
      std::chrono::duration_cast<std::chrono::microseconds>(tp.time_since_epoch())
          .count();
  EXPECT_EQ(BucketUtils::HashLong(timestamp_micros), -2047944441);
  // 2017-11-16T22:31:08.000001 in microseconds
  EXPECT_EQ(BucketUtils::HashLong(timestamp_micros + 1), -1207196810);

  // string
  std::string str = "iceberg";
  EXPECT_EQ(BucketUtils::HashBytes(std::span<const uint8_t>(
                reinterpret_cast<const uint8_t*>(str.data()), str.size())),
            1210000089);

  // uuid
  auto uuid = Uuid::FromString("f79c3e09-677c-4bbd-a479-3f349cb785e7");
  // Stop here on a parse failure instead of dereferencing an empty result.
  ASSERT_TRUE(uuid.has_value());
  EXPECT_EQ(BucketUtils::HashBytes(uuid->bytes()), 1488055340);

  // fixed & binary
  std::vector<uint8_t> fixed = {0, 1, 2, 3};
  EXPECT_EQ(BucketUtils::HashBytes(fixed), -188683207);
}
80+
81+
} // namespace iceberg

src/iceberg/test/decimal_test.cc

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,50 @@ TEST(DecimalTest, FromBigEndianInvalid) {
490490
IsError(ErrorKind::kInvalidArgument));
491491
}
492492

493+
// Round-trips Decimal values through ToBigEndian() / FromBigEndian() and checks
// that decoding the encoded bytes yields the original value.
TEST(DecimalTest, ToBigEndian) {
  // Signed high 64-bit words, including the 32-bit and 64-bit boundaries.
  const std::vector<int64_t> high_words = {0,
                                           1,
                                           -1,
                                           INT32_MAX,
                                           INT32_MIN,
                                           static_cast<int64_t>(INT32_MAX) + 1,
                                           static_cast<int64_t>(INT32_MIN) - 1,
                                           INT64_MAX,
                                           INT64_MIN};
  // Unsigned low 64-bit words, clustered around the 32-bit boundary.
  const std::vector<uint64_t> low_words = {0,
                                           1,
                                           255,
                                           UINT32_MAX,
                                           static_cast<uint64_t>(UINT32_MAX) + 1,
                                           static_cast<uint64_t>(UINT32_MAX) + 2,
                                           static_cast<uint64_t>(UINT32_MAX) + 3,
                                           static_cast<uint64_t>(UINT32_MAX) + 4,
                                           static_cast<uint64_t>(UINT32_MAX) + 5,
                                           static_cast<uint64_t>(UINT32_MAX) + 6,
                                           static_cast<uint64_t>(UINT32_MAX) + 7,
                                           static_cast<uint64_t>(UINT32_MAX) + 8,
                                           UINT64_MAX};

  // Every (high, low) combination must survive the round trip.
  for (const int64_t high_word : high_words) {
    for (const uint64_t low_word : low_words) {
      Decimal original(high_word, low_word);
      auto encoded = original.ToBigEndian();
      auto decoded = Decimal::FromBigEndian(encoded.data(), encoded.size());
      ASSERT_THAT(decoded, IsOk());
      EXPECT_EQ(decoded.value(), original);
    }
  }

  // Values constructed from a single 128-bit integer.
  const std::vector<int128_t> unscaled_values = {
      -INT64_MAX, -INT32_MAX, -255, -1, 0, 1, 255, 256, INT32_MAX, INT64_MAX};
  for (const int128_t unscaled : unscaled_values) {
    Decimal original(unscaled);
    auto encoded = original.ToBigEndian();
    auto decoded = Decimal::FromBigEndian(encoded.data(), encoded.size());
    ASSERT_THAT(decoded, IsOk());
    EXPECT_EQ(decoded.value(), original);
  }
}
536+
493537
TEST(DecimalTestFunctionality, Multiply) {
494538
ASSERT_EQ(Decimal(60501), Decimal(301) * Decimal(201));
495539
ASSERT_EQ(Decimal(-60501), Decimal(-301) * Decimal(201));

src/iceberg/test/literal_test.cc

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,20 @@ TEST(LiteralTest, DoubleZeroComparison) {
256256
EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less);
257257
}
258258

259+
// UUID literals with the same value compare as equivalent; UUID literals with
// different values are expected to be unordered in both directions.
TEST(LiteralTest, UuidComparison) {
  auto base_literal =
      Literal::UUID(Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value());
  auto other_literal =
      Literal::UUID(Uuid::FromString("123e4567-e89b-12d3-a456-426614174001").value());
  auto same_literal =
      Literal::UUID(Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value());

  EXPECT_EQ(base_literal <=> same_literal, std::partial_ordering::equivalent);
  EXPECT_EQ(base_literal <=> other_literal, std::partial_ordering::unordered);
  EXPECT_EQ(other_literal <=> base_literal, std::partial_ordering::unordered);
}
272+
259273
// Parameter struct for literal serialization and deserialization tests
260274
struct LiteralParam {
261275
std::string test_name;
@@ -346,6 +360,17 @@ INSTANTIATE_TEST_SUITE_P(
346360
Literal::Double(std::numeric_limits<double>::lowest()),
347361
float64()},
348362

363+
// Decimal type
364+
LiteralParam{"DecimalPositive",
365+
{1, 226, 64},
366+
Literal::Decimal(123456, 6, 2),
367+
decimal(6, 2)},
368+
LiteralParam{"DecimalNegative",
369+
{254, 29, 192},
370+
Literal::Decimal(-123456, 6, 2),
371+
decimal(6, 2)},
372+
LiteralParam{"DecimalZero", {0}, Literal::Decimal(0, 3, 0), decimal(3, 0)},
373+
349374
LiteralParam{"String",
350375
{105, 99, 101, 98, 101, 114, 103},
351376
Literal::String("iceberg"),
@@ -506,10 +531,28 @@ INSTANTIATE_TEST_SUITE_P(
506531
.literal = Literal::Double(std::numbers::pi),
507532
.expected_type_id = TypeId::kDouble,
508533
.expected_string = "3.141593"},
534+
BasicLiteralTestParam{.test_name = "DecimalPositive",
535+
.literal = Literal::Decimal(123456, 6, 2),
536+
.expected_type_id = TypeId::kDecimal,
537+
.expected_string = "1234.56"},
538+
BasicLiteralTestParam{.test_name = "DecimalNegative",
539+
.literal = Literal::Decimal(-123456, 6, 2),
540+
.expected_type_id = TypeId::kDecimal,
541+
.expected_string = "-1234.56"},
542+
BasicLiteralTestParam{.test_name = "DecimalZero",
543+
.literal = Literal::Decimal(0, 3, 0),
544+
.expected_type_id = TypeId::kDecimal,
545+
.expected_string = "0"},
509546
BasicLiteralTestParam{.test_name = "String",
510547
.literal = Literal::String("hello world"),
511548
.expected_type_id = TypeId::kString,
512549
.expected_string = "\"hello world\""},
550+
BasicLiteralTestParam{
551+
.test_name = "Uuid",
552+
.literal = Literal::UUID(
553+
Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value()),
554+
.expected_type_id = TypeId::kUuid,
555+
.expected_string = "123e4567-e89b-12d3-a456-426614174000"},
513556
BasicLiteralTestParam{
514557
.test_name = "Binary",
515558
.literal = Literal::Binary(std::vector<uint8_t>{0x01, 0x02, 0x03, 0xFF}),
@@ -563,6 +606,10 @@ INSTANTIATE_TEST_SUITE_P(
563606
.small_literal = Literal::Double(1.5),
564607
.large_literal = Literal::Double(2.5),
565608
.equal_literal = Literal::Double(1.5)},
609+
ComparisonLiteralTestParam{.test_name = "Decimal",
610+
.small_literal = Literal::Decimal(123456, 6, 2),
611+
.large_literal = Literal::Decimal(234567, 6, 2),
612+
.equal_literal = Literal::Decimal(123456, 6, 2)},
566613
ComparisonLiteralTestParam{.test_name = "String",
567614
.small_literal = Literal::String("apple"),
568615
.large_literal = Literal::String("banana"),

0 commit comments

Comments
 (0)