1919
2020#include " iceberg/transform.h"
2121
22+ #include < chrono>
2223#include < format>
24+ #include < iostream>
2325#include < memory>
26+ #include < string>
2427
2528#include < gmock/gmock.h>
2629#include < gtest/gtest.h>
2730
2831#include " iceberg/expression/literal.h"
32+ #include " iceberg/transform_function.h"
2933#include " iceberg/type.h"
34+ #include " iceberg/util/decimal.h"
3035#include " iceberg/util/formatter.h" // IWYU pragma: keep
3136#include " matchers.h"
3237
@@ -241,10 +246,75 @@ TEST(TransformLiteralTest, IdentityTransform) {
241246 }
242247}
243248
249+ // The following tests are from
250+ // https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements
251+ TEST (BucketTransformTest, HashHelper) {
252+ // int and long
253+ EXPECT_EQ (BucketTransform::HashInt (34 ), 2017239379 );
254+ EXPECT_EQ (BucketTransform::HashLong (34L ), 2017239379 );
255+
256+ // decimal hash
257+ auto decimal = Decimal::FromString (" 14.20" );
258+ ASSERT_TRUE (decimal.has_value ());
259+ EXPECT_EQ (BucketTransform::HashBytes (Decimal::ToBigEndian (decimal->value ())),
260+ -500754589 );
261+
262+ // date hash
263+ // 2017-11-16
264+ std::chrono::sys_days sd = std::chrono::year{2017 } / 11 / 16 ;
265+ std::chrono::sys_days epoch{std::chrono::year{1970 } / 1 / 1 };
266+ int32_t days = (sd - epoch).count ();
267+ std::cout << " days: " << days << std::endl;
268+ EXPECT_EQ (BucketTransform::HashInt (days), -653330422 );
269+
270+ // time
271+ // 22:31:08 in microseconds
272+ int64_t time_micros = (22 * 3600 + 31 * 60 + 8 ) * 1000000LL ;
273+ std::cout << " time micros: " << time_micros << std::endl;
274+ EXPECT_EQ (BucketTransform::HashLong (time_micros), -662762989 );
275+
276+ // timestamp
277+ // 2017-11-16T22:31:08 in microseconds
278+ std::chrono::system_clock::time_point tp =
279+ std::chrono::sys_days{std::chrono::year{2017 } / 11 / 16 } + std::chrono::hours{22 } +
280+ std::chrono::minutes{31 } + std::chrono::seconds{8 };
281+ int64_t timestamp_micros =
282+ std::chrono::duration_cast<std::chrono::microseconds>(tp.time_since_epoch ())
283+ .count ();
284+ std::cout << " timestamp micros: " << timestamp_micros << std::endl;
285+ EXPECT_EQ (BucketTransform::HashLong (timestamp_micros), -2047944441 );
286+ // 2017-11-16T22:31:08.000001 in microseconds
287+ EXPECT_EQ (BucketTransform::HashLong (timestamp_micros + 1 ), -1207196810 );
288+
289+ // string
290+ std::string str = " iceberg" ;
291+ EXPECT_EQ (BucketTransform::HashBytes (std::span<const uint8_t >(
292+ reinterpret_cast <const uint8_t *>(str.data ()), str.size ())),
293+ 1210000089 );
294+
295+ // uuid
296+ // f79c3e09-677c-4bbd-a479-3f349cb785e7
297+ std::array<uint8_t , 16 > uuid = {0xf7 , 0x9c , 0x3e , 0x09 , 0x67 , 0x7c , 0x4b , 0xbd ,
298+ 0xa4 , 0x79 , 0x3f , 0x34 , 0x9c , 0xb7 , 0x85 , 0xe7 };
299+ EXPECT_EQ (BucketTransform::HashBytes (uuid), 1488055340 );
300+
301+ // fixed & binary
302+ std::vector<uint8_t > fixed = {0 , 1 , 2 , 3 };
303+ EXPECT_EQ (BucketTransform::HashBytes (fixed), -188683207 );
304+ }
305+
244306TEST (TransformLiteralTest, BucketTransform) {
245307 constexpr int32_t num_buckets = 4 ;
246308 auto transform = Transform::Bucket (num_buckets);
247309
310+ // uuid
311+ // f79c3e09-677c-4bbd-a479-3f349cb785e7
312+ std::array<uint8_t , 16 > uuid = {0xf7 , 0x9c , 0x3e , 0x09 , 0x67 , 0x7c , 0x4b , 0xbd ,
313+ 0xa4 , 0x79 , 0x3f , 0x34 , 0x9c , 0xb7 , 0x85 , 0xe7 };
314+
315+ // fixed & binary
316+ std::vector<uint8_t > fixed = {0 , 1 , 2 , 3 };
317+
248318 struct Case {
249319 std::shared_ptr<Type> source_type;
250320 Literal source;
@@ -253,23 +323,43 @@ TEST(TransformLiteralTest, BucketTransform) {
253323
254324 const std::vector<Case> cases = {
255325 {.source_type = iceberg::int32 (),
256- .source = Literal::Int (42 ),
326+ .source = Literal::Int (34 ),
257327 .expected = Literal::Int (3 )},
328+ {.source_type = iceberg::int64 (),
329+ .source = Literal::Long (34 ),
330+ .expected = Literal::Int (3 )},
331+ // decimal 14.20
332+ {.source_type = iceberg::decimal (4 , 2 ),
333+ .source = Literal::Decimal (1420 , 4 , 2 ),
334+ .expected = Literal::Int (3 )},
335+ // 2017-11-16
258336 {.source_type = iceberg::date (),
259- .source = Literal::Date (30000 ),
337+ .source = Literal::Date (17486 ),
260338 .expected = Literal::Int (2 )},
261- {.source_type = iceberg::int64 (),
262- .source = Literal::Long (1234567890 ),
339+ // // 22:31:08 in microseconds
340+ {.source_type = iceberg::time (),
341+ .source = Literal::Time (81068000000 ),
263342 .expected = Literal::Int (3 )},
343+ // // 2017-11-16T22:31:08 in microseconds
264344 {.source_type = iceberg::timestamp (),
265- .source = Literal::Timestamp (1622547800000000 ),
266- .expected = Literal::Int (1 )},
345+ .source = Literal::Timestamp (1510871468000000 ),
346+ .expected = Literal::Int (3 )},
347+ // // 2017-11-16T22:31:08.000001 in microseconds
267348 {.source_type = iceberg::timestamp_tz (),
268- .source = Literal::TimestampTz (1622547800000000 ),
269- .expected = Literal::Int (1 )},
349+ .source = Literal::TimestampTz (1510871468000001 ),
350+ .expected = Literal::Int (2 )},
270351 {.source_type = iceberg::string (),
271- .source = Literal::String (" test" ),
272- .expected = Literal::Int (3 )},
352+ .source = Literal::String (" iceberg" ),
353+ .expected = Literal::Int (1 )},
354+ {.source_type = iceberg::uuid (),
355+ .source = Literal::UUID (uuid),
356+ .expected = Literal::Int (0 )},
357+ {.source_type = iceberg::fixed (4 ),
358+ .source = Literal::Fixed (fixed),
359+ .expected = Literal::Int (1 )},
360+ {.source_type = iceberg::binary (),
361+ .source = Literal::Binary (fixed),
362+ .expected = Literal::Int (1 )},
273363 };
274364
275365 for (const auto & c : cases) {
0 commit comments