Skip to content

Commit d2af530

Browse files
committed
fix: respect Appendix B: 32-bit Hash Requirements
1 parent dc8f229 commit d2af530

File tree

6 files changed

+176
-24
lines changed

6 files changed

+176
-24
lines changed

src/iceberg/expression/literal.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,15 @@ Result<Literal> Literal::Decimal(std::string_view value) {
166166
return Literal{Value{decimal_value.value()}, decimal(precision, scale)};
167167
}
168168

169+
Literal Literal::UUID(std::array<uint8_t, 16> value) {
170+
return {Value{std::move(value)}, uuid()};
171+
}
172+
173+
Literal Literal::Fixed(std::vector<uint8_t> value) {
174+
auto length = static_cast<int32_t>(value.size());
175+
return {Value{std::move(value)}, fixed(length)};
176+
}
177+
169178
Result<Literal> Literal::Deserialize(std::span<const uint8_t> data,
170179
std::shared_ptr<PrimitiveType> type) {
171180
Literal::Value value;

src/iceberg/expression/literal.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919

2020
#pragma once
2121

22+
#include <array>
2223
#include <compare>
24+
#include <cstdint>
2325
#include <memory>
2426
#include <string>
2527
#include <string_view>
@@ -76,6 +78,8 @@ class ICEBERG_EXPORT Literal {
7678
static Literal Binary(std::vector<uint8_t> value);
7779
static Literal Decimal(int128_t value, int32_t precision, int32_t scale);
7880
static Result<Literal> Decimal(std::string_view value);
81+
static Literal UUID(std::array<uint8_t, 16> value);
82+
static Literal Fixed(std::vector<uint8_t> value);
7983

8084
/// \brief Create a literal representing a null value.
8185
static Literal Null(std::shared_ptr<PrimitiveType> type) {

src/iceberg/transform_function.cc

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@
2727

2828
#include "iceberg/expression/literal.h"
2929
#include "iceberg/type.h"
30+
#include "iceberg/type_fwd.h"
31+
#include "iceberg/util/decimal.h"
32+
#include "iceberg/util/endian.h"
3033
#include "iceberg/util/int128.h"
34+
#include "iceberg/util/macros.h"
3135
#include "iceberg/util/murmurhash3_internal.h"
3236
#include "iceberg/util/truncate_util.h"
3337

@@ -54,7 +58,8 @@ BucketTransform::BucketTransform(std::shared_ptr<Type> const& source_type,
5458
: TransformFunction(TransformType::kBucket, source_type), num_buckets_(num_buckets) {}
5559

5660
Result<Literal> BucketTransform::Transform(const Literal& literal) {
57-
assert(literal.type() == source_type());
61+
ICEBERG_DCHECK(*literal.type() == *source_type(),
62+
"Literal type must match source type");
5863
if (literal.IsBelowMin() || literal.IsAboveMax()) {
5964
return InvalidArgument(
6065
"Cannot apply bucket transform to literal with value {} of type {}",
@@ -69,17 +74,18 @@ Result<Literal> BucketTransform::Transform(const Literal& literal) {
6974
[&](auto&& value) {
7075
using T = std::decay_t<decltype(value)>;
7176
if constexpr (std::is_same_v<T, int32_t>) {
72-
MurmurHash3_x86_32(&value, sizeof(int32_t), 0, &hash_value);
77+
hash_value = HashInt(value);
7378
} else if constexpr (std::is_same_v<T, int64_t>) {
74-
MurmurHash3_x86_32(&value, sizeof(int64_t), 0, &hash_value);
79+
hash_value = HashLong(value);
7580
} else if constexpr (std::is_same_v<T, std::array<uint8_t, 16>>) {
76-
MurmurHash3_x86_32(value.data(), sizeof(uint8_t) * 16, 0, &hash_value);
81+
hash_value = HashBytes(value);
7782
} else if constexpr (std::is_same_v<T, int128_t>) {
78-
MurmurHash3_x86_32(&value, sizeof(int128_t), 0, &hash_value);
83+
hash_value = HashBytes(Decimal::ToBigEndian(value));
7984
} else if constexpr (std::is_same_v<T, std::string>) {
80-
MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value);
85+
hash_value = HashBytes(std::span<const uint8_t>(
86+
reinterpret_cast<const uint8_t*>(value.data()), value.size()));
8187
} else if constexpr (std::is_same_v<T, std::vector<uint8_t>>) {
82-
MurmurHash3_x86_32(value.data(), value.size(), 0, &hash_value);
88+
hash_value = HashBytes(value);
8389
} else if constexpr (std::is_same_v<T, std::monostate> ||
8490
std::is_same_v<T, bool> || std::is_same_v<T, float> ||
8591
std::is_same_v<T, double> ||
@@ -129,12 +135,30 @@ Result<std::unique_ptr<TransformFunction>> BucketTransform::Make(
129135
return std::make_unique<BucketTransform>(source_type, num_buckets);
130136
}
131137

138+
int32_t BucketTransform::HashBytes(std::span<const uint8_t> bytes) {
139+
int32_t hash_value = 0;
140+
MurmurHash3_x86_32(bytes.data(), bytes.size(), 0, &hash_value);
141+
return hash_value;
142+
}
143+
144+
int32_t BucketTransform::HashInt(int32_t value) {
145+
return HashLong(static_cast<int64_t>(value));
146+
}
147+
148+
int32_t BucketTransform::HashLong(int64_t value) {
149+
int32_t hash_value = 0;
150+
value = ToLittleEndian(value);
151+
MurmurHash3_x86_32(&value, sizeof(int64_t), 0, &hash_value);
152+
return hash_value;
153+
}
154+
132155
TruncateTransform::TruncateTransform(std::shared_ptr<Type> const& source_type,
133156
int32_t width)
134157
: TransformFunction(TransformType::kTruncate, source_type), width_(width) {}
135158

136159
Result<Literal> TruncateTransform::Transform(const Literal& literal) {
137-
assert(literal.type() == source_type());
160+
ICEBERG_DCHECK(*literal.type() == *source_type(),
161+
"Literal type must match source type");
138162
if (literal.IsBelowMin() || literal.IsAboveMax()) {
139163
return InvalidArgument(
140164
"Cannot apply truncate transform to literal with value {} of type {}",
@@ -205,7 +229,8 @@ YearTransform::YearTransform(std::shared_ptr<Type> const& source_type)
205229
: TransformFunction(TransformType::kTruncate, source_type) {}
206230

207231
Result<Literal> YearTransform::Transform(const Literal& literal) {
208-
assert(literal.type() == source_type());
232+
ICEBERG_DCHECK(*literal.type() == *source_type(),
233+
"Literal type must match source type");
209234
if (literal.IsBelowMin() || literal.IsAboveMax()) {
210235
return InvalidArgument(
211236
"Cannot apply year transform to literal with value {} of type {}",
@@ -258,7 +283,8 @@ MonthTransform::MonthTransform(std::shared_ptr<Type> const& source_type)
258283
: TransformFunction(TransformType::kMonth, source_type) {}
259284

260285
Result<Literal> MonthTransform::Transform(const Literal& literal) {
261-
assert(literal.type() == source_type());
286+
ICEBERG_DCHECK(*literal.type() == *source_type(),
287+
"Literal type must match source type");
262288
if (literal.IsBelowMin() || literal.IsAboveMax()) {
263289
return InvalidArgument(
264290
"Cannot apply month transform to literal with value {} of type {}",
@@ -323,7 +349,8 @@ DayTransform::DayTransform(std::shared_ptr<Type> const& source_type)
323349
: TransformFunction(TransformType::kDay, source_type) {}
324350

325351
Result<Literal> DayTransform::Transform(const Literal& literal) {
326-
assert(literal.type() == source_type());
352+
ICEBERG_DCHECK(*literal.type() == *source_type(),
353+
"Literal type must match source type");
327354
if (literal.IsBelowMin() || literal.IsAboveMax()) {
328355
return InvalidArgument(
329356
"Cannot apply day transform to literal with value {} of type {}",
@@ -376,7 +403,8 @@ HourTransform::HourTransform(std::shared_ptr<Type> const& source_type)
376403
: TransformFunction(TransformType::kHour, source_type) {}
377404

378405
Result<Literal> HourTransform::Transform(const Literal& literal) {
379-
assert(literal.type() == source_type());
406+
ICEBERG_DCHECK(*literal.type() == *source_type(),
407+
"Literal type must match source type");
380408
if (literal.IsBelowMin() || literal.IsAboveMax()) {
381409
return InvalidArgument(
382410
"Cannot apply hour transform to literal with value {} of type {}",

src/iceberg/transform_function.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ class BucketTransform : public TransformFunction {
5151
BucketTransform(std::shared_ptr<Type> const& source_type, int32_t num_buckets);
5252

5353
/// \brief Applies the bucket hash function to the input Literal.
54+
///
55+
/// Reference:
56+
/// - https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements
5457
Result<Literal> Transform(const Literal& literal) override;
5558

5659
/// \brief Returns INT32 as the output type.
@@ -63,6 +66,24 @@ class BucketTransform : public TransformFunction {
6366
static Result<std::unique_ptr<TransformFunction>> Make(
6467
std::shared_ptr<Type> const& source_type, int32_t num_buckets);
6568

69+
/// \brief Hash a byte array using MurmurHash3 and return a 32-bit hash value.
70+
/// \param bytes The input byte array to hash.
71+
/// \return A 32-bit hash value.
72+
static int32_t HashBytes(std::span<const uint8_t> bytes);
73+
74+
/// \brief Hash a 32-bit integer using MurmurHash3 and return a 32-bit hash value.
75+
/// \param value The input integer to hash.
76+
/// \note Integer and long hash results must be identical for all integer values. This
77+
/// ensures that schema evolution does not change bucket partition values if integer
78+
/// types are promoted.
79+
/// \return A 32-bit hash value.
80+
static int32_t HashInt(int32_t value);
81+
82+
/// \brief Hash a 64-bit integer using MurmurHash3 and return a 32-bit hash value.
83+
/// \param value The input long to hash.
84+
/// \return A 32-bit hash value.
85+
static int32_t HashLong(int64_t value);
86+
6687
private:
6788
int32_t num_buckets_;
6889
};

src/iceberg/util/decimal.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -524,8 +524,8 @@ std::vector<uint8_t> Decimal::ToBigEndian(int128_t value) {
524524
// For negative numbers, keep the leading 0xff byte if the next byte has its sign bit
525525
// unset. For positive numbers, keep the leading 0x00 byte if the next byte has its
526526
// sign bit set.
527-
if ((is_negative && byte == 0xff && (next & 0x80) == 0) ||
528-
(!is_negative && byte == 0x00 && (next & 0x80) != 0)) {
527+
if ((is_negative && byte == 0xff && (next & 0x80)) ||
528+
(!is_negative && byte == 0x00 && !(next & 0x80))) {
529529
--keep;
530530
} else {
531531
break;

test/transform_test.cc

Lines changed: 100 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,19 @@
1919

2020
#include "iceberg/transform.h"
2121

22+
#include <chrono>
2223
#include <format>
24+
#include <iostream>
2325
#include <memory>
26+
#include <string>
2427

2528
#include <gmock/gmock.h>
2629
#include <gtest/gtest.h>
2730

2831
#include "iceberg/expression/literal.h"
32+
#include "iceberg/transform_function.h"
2933
#include "iceberg/type.h"
34+
#include "iceberg/util/decimal.h"
3035
#include "iceberg/util/formatter.h" // IWYU pragma: keep
3136
#include "matchers.h"
3237

@@ -241,10 +246,75 @@ TEST(TransformLiteralTest, IdentityTransform) {
241246
}
242247
}
243248

249+
// The following tests are from
250+
// https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements
251+
TEST(BucketTransformTest, HashHelper) {
252+
// int and long
253+
EXPECT_EQ(BucketTransform::HashInt(34), 2017239379);
254+
EXPECT_EQ(BucketTransform::HashLong(34L), 2017239379);
255+
256+
// decimal hash
257+
auto decimal = Decimal::FromString("14.20");
258+
ASSERT_TRUE(decimal.has_value());
259+
EXPECT_EQ(BucketTransform::HashBytes(Decimal::ToBigEndian(decimal->value())),
260+
-500754589);
261+
262+
// date hash
263+
// 2017-11-16
264+
std::chrono::sys_days sd = std::chrono::year{2017} / 11 / 16;
265+
std::chrono::sys_days epoch{std::chrono::year{1970} / 1 / 1};
266+
int32_t days = (sd - epoch).count();
267+
std::cout << "days: " << days << std::endl;
268+
EXPECT_EQ(BucketTransform::HashInt(days), -653330422);
269+
270+
// time
271+
// 22:31:08 in microseconds
272+
int64_t time_micros = (22 * 3600 + 31 * 60 + 8) * 1000000LL;
273+
std::cout << "time micros: " << time_micros << std::endl;
274+
EXPECT_EQ(BucketTransform::HashLong(time_micros), -662762989);
275+
276+
// timestamp
277+
// 2017-11-16T22:31:08 in microseconds
278+
std::chrono::system_clock::time_point tp =
279+
std::chrono::sys_days{std::chrono::year{2017} / 11 / 16} + std::chrono::hours{22} +
280+
std::chrono::minutes{31} + std::chrono::seconds{8};
281+
int64_t timestamp_micros =
282+
std::chrono::duration_cast<std::chrono::microseconds>(tp.time_since_epoch())
283+
.count();
284+
std::cout << "timestamp micros: " << timestamp_micros << std::endl;
285+
EXPECT_EQ(BucketTransform::HashLong(timestamp_micros), -2047944441);
286+
// 2017-11-16T22:31:08.000001 in microseconds
287+
EXPECT_EQ(BucketTransform::HashLong(timestamp_micros + 1), -1207196810);
288+
289+
// string
290+
std::string str = "iceberg";
291+
EXPECT_EQ(BucketTransform::HashBytes(std::span<const uint8_t>(
292+
reinterpret_cast<const uint8_t*>(str.data()), str.size())),
293+
1210000089);
294+
295+
// uuid
296+
// f79c3e09-677c-4bbd-a479-3f349cb785e7
297+
std::array<uint8_t, 16> uuid = {0xf7, 0x9c, 0x3e, 0x09, 0x67, 0x7c, 0x4b, 0xbd,
298+
0xa4, 0x79, 0x3f, 0x34, 0x9c, 0xb7, 0x85, 0xe7};
299+
EXPECT_EQ(BucketTransform::HashBytes(uuid), 1488055340);
300+
301+
// fixed & binary
302+
std::vector<uint8_t> fixed = {0, 1, 2, 3};
303+
EXPECT_EQ(BucketTransform::HashBytes(fixed), -188683207);
304+
}
305+
244306
TEST(TransformLiteralTest, BucketTransform) {
245307
constexpr int32_t num_buckets = 4;
246308
auto transform = Transform::Bucket(num_buckets);
247309

310+
// uuid
311+
// f79c3e09-677c-4bbd-a479-3f349cb785e7
312+
std::array<uint8_t, 16> uuid = {0xf7, 0x9c, 0x3e, 0x09, 0x67, 0x7c, 0x4b, 0xbd,
313+
0xa4, 0x79, 0x3f, 0x34, 0x9c, 0xb7, 0x85, 0xe7};
314+
315+
// fixed & binary
316+
std::vector<uint8_t> fixed = {0, 1, 2, 3};
317+
248318
struct Case {
249319
std::shared_ptr<Type> source_type;
250320
Literal source;
@@ -253,23 +323,43 @@ TEST(TransformLiteralTest, BucketTransform) {
253323

254324
const std::vector<Case> cases = {
255325
{.source_type = iceberg::int32(),
256-
.source = Literal::Int(42),
326+
.source = Literal::Int(34),
257327
.expected = Literal::Int(3)},
328+
{.source_type = iceberg::int64(),
329+
.source = Literal::Long(34),
330+
.expected = Literal::Int(3)},
331+
// decimal 14.20
332+
{.source_type = iceberg::decimal(4, 2),
333+
.source = Literal::Decimal(1420, 4, 2),
334+
.expected = Literal::Int(3)},
335+
// 2017-11-16
258336
{.source_type = iceberg::date(),
259-
.source = Literal::Date(30000),
337+
.source = Literal::Date(17486),
260338
.expected = Literal::Int(2)},
261-
{.source_type = iceberg::int64(),
262-
.source = Literal::Long(1234567890),
339+
// // 22:31:08 in microseconds
340+
{.source_type = iceberg::time(),
341+
.source = Literal::Time(81068000000),
263342
.expected = Literal::Int(3)},
343+
// // 2017-11-16T22:31:08 in microseconds
264344
{.source_type = iceberg::timestamp(),
265-
.source = Literal::Timestamp(1622547800000000),
266-
.expected = Literal::Int(1)},
345+
.source = Literal::Timestamp(1510871468000000),
346+
.expected = Literal::Int(3)},
347+
// // 2017-11-16T22:31:08.000001 in microseconds
267348
{.source_type = iceberg::timestamp_tz(),
268-
.source = Literal::TimestampTz(1622547800000000),
269-
.expected = Literal::Int(1)},
349+
.source = Literal::TimestampTz(1510871468000001),
350+
.expected = Literal::Int(2)},
270351
{.source_type = iceberg::string(),
271-
.source = Literal::String("test"),
272-
.expected = Literal::Int(3)},
352+
.source = Literal::String("iceberg"),
353+
.expected = Literal::Int(1)},
354+
{.source_type = iceberg::uuid(),
355+
.source = Literal::UUID(uuid),
356+
.expected = Literal::Int(0)},
357+
{.source_type = iceberg::fixed(4),
358+
.source = Literal::Fixed(fixed),
359+
.expected = Literal::Int(1)},
360+
{.source_type = iceberg::binary(),
361+
.source = Literal::Binary(fixed),
362+
.expected = Literal::Int(1)},
273363
};
274364

275365
for (const auto& c : cases) {

0 commit comments

Comments
 (0)