Skip to content

Commit 6e5976c

Browse files
authored
feat: support decimal literal and refactor transform utilities (#238)
1 parent 95a51a1 commit 6e5976c

24 files changed

+1548
-557
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,14 @@ set(ICEBERG_SOURCES
5959
transform.cc
6060
transform_function.cc
6161
type.cc
62+
util/bucket_util.cc
6263
util/conversions.cc
6364
util/decimal.cc
6465
util/gzip_internal.cc
6566
util/murmurhash3_internal.cc
67+
util/temporal_util.cc
6668
util/timepoint.cc
69+
util/truncate_util.cc
6770
util/uuid.cc
6871
v1_metadata.cc
6972
v2_metadata.cc

src/iceberg/expression/literal.cc

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@
2424
#include <cstdint>
2525
#include <string>
2626

27-
#include "iceberg/type_fwd.h"
2827
#include "iceberg/util/checked_cast.h"
2928
#include "iceberg/util/conversions.h"
29+
#include "iceberg/util/macros.h"
3030

3131
namespace iceberg {
3232

@@ -188,11 +188,14 @@ Result<Literal> LiteralCaster::CastFromString(
188188
const auto& str_val = std::get<std::string>(literal.value_);
189189

190190
switch (target_type->type_id()) {
191+
case TypeId::kUuid: {
192+
ICEBERG_ASSIGN_OR_RAISE(auto uuid, Uuid::FromString(str_val));
193+
return Literal::UUID(uuid);
194+
}
191195
case TypeId::kDate:
192196
case TypeId::kTime:
193197
case TypeId::kTimestamp:
194198
case TypeId::kTimestampTz:
195-
case TypeId::kUuid:
196199
return NotImplemented("Cast from String to {} is not implemented yet",
197200
target_type->ToString());
198201
default:
@@ -296,6 +299,10 @@ Literal Literal::Fixed(std::vector<uint8_t> value) {
296299
return {Value{std::move(value)}, fixed(size)};
297300
}
298301

302+
/// Builds a decimal literal: the unscaled 128-bit integer is wrapped in an
/// ::iceberg::Decimal and paired with the type decimal(precision, scale).
Literal Literal::Decimal(int128_t value, int32_t precision, int32_t scale) {
  Value unscaled{::iceberg::Decimal(value)};
  return {std::move(unscaled), decimal(precision, scale)};
}
305+
299306
Result<Literal> Literal::Deserialize(std::span<const uint8_t> data,
300307
std::shared_ptr<PrimitiveType> type) {
301308
return Conversions::FromBytes(std::move(type), data);
@@ -385,6 +392,15 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const {
385392
return CompareFloat(this_val, other_val);
386393
}
387394

395+
case TypeId::kDecimal: {
396+
auto& this_val = std::get<::iceberg::Decimal>(value_);
397+
auto& other_val = std::get<::iceberg::Decimal>(other.value_);
398+
const auto& this_decimal_type = internal::checked_cast<DecimalType&>(*type_);
399+
const auto& other_decimal_type = internal::checked_cast<DecimalType&>(*other.type_);
400+
return ::iceberg::Decimal::Compare(this_val, other_val, this_decimal_type.scale(),
401+
other_decimal_type.scale());
402+
}
403+
388404
case TypeId::kString: {
389405
auto& this_val = std::get<std::string>(value_);
390406
auto& other_val = std::get<std::string>(other.value_);
@@ -440,6 +456,12 @@ std::string Literal::ToString() const {
440456
case TypeId::kDouble: {
441457
return std::to_string(std::get<double>(value_));
442458
}
459+
case TypeId::kDecimal: {
460+
const auto& decimal_type = internal::checked_cast<DecimalType&>(*type_);
461+
const auto& decimal = std::get<::iceberg::Decimal>(value_);
462+
return decimal.ToString(decimal_type.scale())
463+
.value_or("invalid literal of type decimal");
464+
}
443465
case TypeId::kString: {
444466
return "\"" + std::get<std::string>(value_) + "\"";
445467
}

src/iceberg/expression/literal.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@
2727

2828
#include "iceberg/result.h"
2929
#include "iceberg/type.h"
30+
#include "iceberg/util/decimal.h"
3031
#include "iceberg/util/formattable.h"
32+
#include "iceberg/util/int128.h"
3133
#include "iceberg/util/uuid.h"
3234

3335
namespace iceberg {
@@ -57,9 +59,9 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
5759
float, // for float
5860
double, // for double
5961
std::string, // for string
60-
Uuid, // for uuid
61-
std::vector<uint8_t>, // for binary, fixed
62-
std::array<uint8_t, 16>, // for decimal
62+
std::vector<uint8_t>, // for binary, fixed
63+
::iceberg::Decimal, // for decimal
64+
Uuid, // for uuid
6365
BelowMin, AboveMax>;
6466

6567
/// \brief Factory methods for primitive types
@@ -77,6 +79,10 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
7779
static Literal Binary(std::vector<uint8_t> value);
7880
static Literal Fixed(std::vector<uint8_t> value);
7981

82+
/// \brief Create a decimal literal.
83+
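/// \param value The unscaled 128-bit integer value.
/// \param precision The precision of the decimal type assigned to the literal.
/// \param scale The scale of the decimal type assigned to the literal.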
84+
static Literal Decimal(int128_t value, int32_t precision, int32_t scale);
85+
8086
/// \brief Create a literal representing a null value.
8187
static Literal Null(std::shared_ptr<PrimitiveType> type) {
8288
return {Value{std::monostate{}}, std::move(type)};
@@ -205,6 +211,11 @@ struct LiteralTraits<TypeId::kDouble> {
205211
using ValueType = double;
206212
};
207213

214+
template <>
215+
struct LiteralTraits<TypeId::kDecimal> {
216+
using ValueType = Decimal;
217+
};
218+
208219
template <>
209220
struct LiteralTraits<TypeId::kString> {
210221
using ValueType = std::string;

src/iceberg/manifest_adapter.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,9 +220,12 @@ Status ManifestEntryAdapter::AppendPartitionValues(
220220
break;
221221
case TypeId::kDecimal:
222222
ICEBERG_RETURN_UNEXPECTED(AppendField(
223-
child_array, std::get<std::array<uint8_t, 16>>(partition_value.value())));
223+
child_array, std::get<Decimal>(partition_value.value()).ToBytes()));
224224
break;
225225
case TypeId::kUuid:
226+
ICEBERG_RETURN_UNEXPECTED(
227+
AppendField(child_array, std::get<Uuid>(partition_value.value()).bytes()));
228+
break;
226229
case TypeId::kStruct:
227230
case TypeId::kList:
228231
case TypeId::kMap:

src/iceberg/meson.build

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,11 +81,14 @@ iceberg_sources = files(
8181
'transform.cc',
8282
'transform_function.cc',
8383
'type.cc',
84+
'util/bucket_util.cc',
8485
'util/conversions.cc',
8586
'util/decimal.cc',
8687
'util/gzip_internal.cc',
8788
'util/murmurhash3_internal.cc',
89+
'util/temporal_util.cc',
8890
'util/timepoint.cc',
91+
'util/truncate_util.cc',
8992
'util/uuid.cc',
9093
'v1_metadata.cc',
9194
'v2_metadata.cc',

src/iceberg/test/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,13 @@ add_iceberg_test(json_serde_test
9999

100100
add_iceberg_test(util_test
101101
SOURCES
102+
bucket_util_test.cc
102103
config_test.cc
103104
decimal_test.cc
104105
endian_test.cc
105106
formatter_test.cc
106107
string_util_test.cc
108+
truncate_util_test.cc
107109
uuid_test.cc
108110
visit_type_test.cc)
109111

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/util/bucket_util.h"
21+
22+
#include <chrono>
23+
24+
#include <gtest/gtest.h>
25+
26+
#include "iceberg/util/decimal.h"
27+
#include "iceberg/util/uuid.h"
28+
29+
namespace iceberg {
30+
31+
// The following tests are from
32+
// https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements
33+
TEST(BucketUtilsTest, HashHelper) {
  // int and long: both helpers are expected to produce the same hash for 34.
  EXPECT_EQ(BucketUtils::HashInt(34), 2017239379);
  EXPECT_EQ(BucketUtils::HashLong(34L), 2017239379);

  // decimal hash: hashed over the bytes returned by ToBigEndian().
  auto decimal = Decimal::FromString("14.20");
  ASSERT_TRUE(decimal.has_value());
  EXPECT_EQ(BucketUtils::HashBytes(decimal->ToBigEndian()), -500754589);

  // date hash: 2017-11-16 expressed as days since 1970-01-01.
  std::chrono::sys_days sd = std::chrono::year{2017} / 11 / 16;
  std::chrono::sys_days epoch{std::chrono::year{1970} / 1 / 1};
  int32_t days = (sd - epoch).count();
  EXPECT_EQ(BucketUtils::HashInt(days), -653330422);

  // time
  // 22:31:08 in microseconds
  int64_t time_micros = (22 * 3600 + 31 * 60 + 8) * 1000000LL;
  EXPECT_EQ(BucketUtils::HashLong(time_micros), -662762989);

  // timestamp
  // 2017-11-16T22:31:08 in microseconds
  std::chrono::system_clock::time_point tp =
      std::chrono::sys_days{std::chrono::year{2017} / 11 / 16} + std::chrono::hours{22} +
      std::chrono::minutes{31} + std::chrono::seconds{8};
  int64_t timestamp_micros =
      std::chrono::duration_cast<std::chrono::microseconds>(tp.time_since_epoch())
          .count();
  EXPECT_EQ(BucketUtils::HashLong(timestamp_micros), -2047944441);
  // 2017-11-16T22:31:08.000001 in microseconds
  EXPECT_EQ(BucketUtils::HashLong(timestamp_micros + 1), -1207196810);

  // string: hashed over the raw characters of the string.
  std::string str = "iceberg";
  EXPECT_EQ(BucketUtils::HashBytes(std::span<const uint8_t>(
                reinterpret_cast<const uint8_t*>(str.data()), str.size())),
            1210000089);

  // uuid
  auto uuid = Uuid::FromString("f79c3e09-677c-4bbd-a479-3f349cb785e7");
  // Check the parse result before dereferencing it, as done for the decimal
  // above, so a parse failure fails the test instead of reading an empty result.
  ASSERT_TRUE(uuid.has_value());
  EXPECT_EQ(BucketUtils::HashBytes(uuid->bytes()), 1488055340);

  // fixed & binary
  std::vector<uint8_t> fixed = {0, 1, 2, 3};
  EXPECT_EQ(BucketUtils::HashBytes(fixed), -188683207);
}
80+
81+
} // namespace iceberg

src/iceberg/test/decimal_test.cc

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,50 @@ TEST(DecimalTest, FromBigEndianInvalid) {
490490
IsError(ErrorKind::kInvalidArgument));
491491
}
492492

493+
// Round-trips Decimal values through ToBigEndian() / FromBigEndian() and
// checks that the decoded value equals the one that was encoded.
TEST(DecimalTest, ToBigEndian) {
  constexpr int64_t kInt32Max = INT32_MAX;
  constexpr int64_t kInt32Min = INT32_MIN;
  constexpr uint64_t kUint32Max = UINT32_MAX;

  const std::vector<int64_t> high_parts = {
      0, 1, -1, kInt32Max, kInt32Min, kInt32Max + 1, kInt32Min - 1, INT64_MAX, INT64_MIN};
  const std::vector<uint64_t> low_parts = {
      0,
      1,
      255,
      kUint32Max,
      kUint32Max + 1,
      kUint32Max + 2,
      kUint32Max + 3,
      kUint32Max + 4,
      kUint32Max + 5,
      kUint32Max + 6,
      kUint32Max + 7,
      kUint32Max + 8,
      UINT64_MAX};

  // Every (high, low) combination built with the two-argument constructor.
  for (const int64_t high_part : high_parts) {
    for (const uint64_t low_part : low_parts) {
      Decimal original(high_part, low_part);
      auto encoded = original.ToBigEndian();
      auto decoded = Decimal::FromBigEndian(encoded.data(), encoded.size());
      ASSERT_THAT(decoded, IsOk());
      EXPECT_EQ(decoded.value(), original);
    }
  }

  // Values built with the single-argument int128_t constructor.
  const std::vector<int128_t> unscaled_values = {
      -INT64_MAX, -INT32_MAX, -255, -1, 0, 1, 255, 256, INT32_MAX, INT64_MAX};
  for (const int128_t unscaled : unscaled_values) {
    Decimal original(unscaled);
    auto encoded = original.ToBigEndian();
    auto decoded = Decimal::FromBigEndian(encoded.data(), encoded.size());
    ASSERT_THAT(decoded, IsOk());
    EXPECT_EQ(decoded.value(), original);
  }
}
536+
493537
TEST(DecimalTestFunctionality, Multiply) {
494538
ASSERT_EQ(Decimal(60501), Decimal(301) * Decimal(201));
495539
ASSERT_EQ(Decimal(-60501), Decimal(-301) * Decimal(201));
@@ -671,4 +715,58 @@ TEST(DecimalTest, Rescale) {
671715
ASSERT_THAT(Decimal(5555555).Rescale(6, 1), IsError(ErrorKind::kInvalid));
672716
}
673717

718+
// Checks Decimal::Compare() on pairs of unscaled values with per-operand scales.
TEST(DecimalTest, Compare) {
  struct CompareCase {
    const char* lhs;        // unscaled left operand, as decimal digits
    const char* rhs;        // unscaled right operand, as decimal digits
    int32_t lhs_scale;      // scale passed for the left operand
    int32_t rhs_scale;      // scale passed for the right operand
    std::partial_ordering expected;
  };

  const CompareCase cases[] = {
      // Largest positive unscaled value, 10^38 - 1: multiplying it by 10 to
      // align the scales does not fit in 128 bits.
      {"99999999999999999999999999999999999999", "99999999999999999999999999999999999999",
       2, 3, std::partial_ordering::greater},
      // 10^37 - 1: multiplying by 10 still fits in 128 bits.
      {"9999999999999999999999999999999999999", "99999999999999999999999999999999999999",
       2, 3, std::partial_ordering::less},

      // Smallest negative unscaled value, -10^38 + 1: multiplying it by 10 to
      // align the scales does not fit in 128 bits.
      {"-99999999999999999999999999999999999999",
       "-99999999999999999999999999999999999999", 2, 3, std::partial_ordering::less},
      // -10^37 + 1: multiplying by 10 still fits in 128 bits.
      {"-9999999999999999999999999999999999999",
       "-99999999999999999999999999999999999999", 2, 3, std::partial_ordering::greater},

      // Equal values expressed with different scales.
      {"123456789", "1234567890", 2, 3, std::partial_ordering::equivalent},
      {"-1234567890", "-123456789", 3, 2, std::partial_ordering::equivalent},

      // Different values with different scales.
      {"123456788", "1234567890", 2, 3, std::partial_ordering::less},
      {"-1234567890", "-123456788", 2, 3, std::partial_ordering::less},

      // Different values with the same scale.
      {"123456790", "123456789", 2, 2, std::partial_ordering::greater},
      {"-123456790", "-123456789", 2, 2, std::partial_ordering::less},

      // Operands with different signs.
      {"123456789", "-123456789", 2, 3, std::partial_ordering::greater},
      {"-123456789", "123456789", 2, 3, std::partial_ordering::less},

      // Comparisons involving zero.
      {"0", "0", 2, 3, std::partial_ordering::equivalent},
      {"0", "123456789", 2, 3, std::partial_ordering::less},
      {"-123456789", "0", 2, 3, std::partial_ordering::less},
  };

  for (const auto& c : cases) {
    // Identify the failing row, since every case shares one assertion line.
    SCOPED_TRACE(::testing::Message() << "lhs=" << c.lhs << " (scale " << c.lhs_scale
                                      << "), rhs=" << c.rhs << " (scale " << c.rhs_scale
                                      << ")");
    ASSERT_EQ(Decimal::Compare(Decimal(c.lhs), Decimal(c.rhs), c.lhs_scale, c.rhs_scale),
              c.expected);
  }
}
771+
674772
} // namespace iceberg

0 commit comments

Comments
 (0)