Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions src/iceberg/expression/literal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -554,4 +554,51 @@ Result<Literal> LiteralCaster::CastTo(const Literal& literal,
target_type->ToString());
}

// LiteralValueHash implementation
//
// Hashes a Literal::Value by visiting whichever alternative is currently held
// and dispatching on its static type:
//   - std::monostate                         -> 0
//   - Literal::BelowMin / Literal::AboveMax  -> smallest / largest std::size_t
//   - bool, int32_t, int64_t, float, double,
//     std::string                            -> std::hash<T>
//   - std::vector<uint8_t>, Uuid             -> byte-by-byte combine, seeded with 0
//   - Decimal                                -> hash of the high 64 bits combined
//                                               with the hash of the low 64 bits
// Any other alternative fails to compile (see the static_assert at the end).
std::size_t LiteralValueHash::operator()(const Literal::Value& value) const noexcept {
return std::visit(
[](const auto& v) -> std::size_t {
// T is the alternative's type with reference and cv-qualifiers stripped.
using T = std::decay_t<decltype(v)>;

// Additive constant used by every combine step below.
// NOTE(review): despite the name, 0x9e3779b9 (2654435769) is divisible by 3
// and therefore not prime; it is the 2^32 / golden-ratio constant that
// hash_combine-style mixers conventionally use.
constexpr size_t kHashPrime = 0x9e3779b9;

if constexpr (std::is_same_v<T, std::monostate>) {
return 0;
} else if constexpr (std::is_same_v<T, Literal::BelowMin>) {
// numeric_limits<std::size_t>::min() is 0, so this yields the same hash as
// the std::monostate branch above.
return std::numeric_limits<std::size_t>::min();
} else if constexpr (std::is_same_v<T, Literal::AboveMax>) {
return std::numeric_limits<std::size_t>::max();
} else if constexpr (std::is_same_v<T, bool> || std::is_same_v<T, int32_t> ||
std::is_same_v<T, int64_t> || std::is_same_v<T, float> ||
std::is_same_v<T, double> ||
std::is_same_v<T, std::string>) {
// These types are hashed directly with their std::hash specialization.
return std::hash<T>{}(v);
} else if constexpr (std::is_same_v<T, std::vector<uint8_t>>) {
// Fold the bytes into a running hash; an empty vector hashes to 0.
std::size_t hash = 0;
for (size_t i = 0; i < v.size(); ++i) {
// Same shape as the boost::hash_combine step. The shifted copies of the
// running hash feed the accumulated state back into every iteration, so
// the order of the bytes influences the result, not just their values.
hash ^= std::hash<uint8_t>{}(v[i]) + kHashPrime + (hash << 6) + (hash >> 2);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hashing identical bytes at different positions can produce same hash (e.g., [1,2] and [2,1] might collide).
Can we add position i into the hash value to decrease the possibility of collide?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hashing identical bytes at different positions can produce same hash (e.g., [1,2] and [2,1] might collide).

I don't think this statement is correct, the left and right shifts should ensure a difference.
I ran a quick demo on Godbolt [1], and as you can see, the hashes of [1,2] and [2,1] are different.

[1] https://godbolt.org/z/4ss9E749q

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I traced through the logic again and realized you are right. The hash is position-sensitive because each iteration depends on the accumulated hash state from previous iterations.

}
return hash;
} else if constexpr (std::is_same_v<T, Decimal>) {
// Split the 128-bit value into two 64-bit halves: seed with the hash of the
// high half, then combine in the hash of the low half.
const int128_t& val = v.value();
std::size_t hash = std::hash<uint64_t>{}(static_cast<uint64_t>(val >> 64));
hash ^= std::hash<uint64_t>{}(static_cast<uint64_t>(val)) + kHashPrime +
(hash << 6) + (hash >> 2);
return hash;
} else if constexpr (std::is_same_v<T, Uuid>) {
// Same byte-wise fold as the std::vector<uint8_t> branch, applied to the
// sequence returned by v.bytes().
std::size_t hash = 0;
const auto& bytes = v.bytes();
for (size_t i = 0; i < bytes.size(); ++i) {
hash ^=
std::hash<uint8_t>{}(bytes[i]) + kHashPrime + (hash << 6) + (hash >> 2);
}
return hash;
} else {
// The condition depends on T, so it is only evaluated if this branch is
// instantiated, i.e. when the variant holds an alternative with no
// handler above. That turns a missing case into a compile-time error.
static_assert(sizeof(T) == 0, "Unhandled variant type in LiteralValueHash");
return 0;
}
},
value);
}

} // namespace iceberg
98 changes: 31 additions & 67 deletions src/iceberg/expression/literal.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,79 +166,43 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
std::shared_ptr<PrimitiveType> type_;
};

/// \brief Compile-time mapping from a TypeId to the C++ type named by
/// `ValueType`.
///
/// The primary template yields `void`; each explicit specialization below
/// overrides `ValueType` for one TypeId.
template <TypeId type_id>
struct LiteralTraits {
using ValueType = void;
};

/// \brief TypeId::kBoolean maps to `bool`.
template <>
struct LiteralTraits<TypeId::kBoolean> {
using ValueType = bool;
};

/// \brief TypeId::kInt maps to `int32_t`.
template <>
struct LiteralTraits<TypeId::kInt> {
using ValueType = int32_t;
};

/// \brief TypeId::kDate maps to `int32_t`.
template <>
struct LiteralTraits<TypeId::kDate> {
using ValueType = int32_t;
};

/// \brief TypeId::kLong maps to `int64_t`.
template <>
struct LiteralTraits<TypeId::kLong> {
using ValueType = int64_t;
};

/// \brief TypeId::kTime maps to `int64_t`.
template <>
struct LiteralTraits<TypeId::kTime> {
using ValueType = int64_t;
};

template <>
struct LiteralTraits<TypeId::kTimestamp> {
using ValueType = int64_t;
/// \brief Hash functor for Literal::Value, to facilitate use in unordered
/// containers.
///
/// Only the call operator is declared here; its definition is out of line.
struct ICEBERG_EXPORT LiteralValueHash {
/// \brief Compute the hash of a literal value.
/// \param value The value to hash.
/// \return The hash of `value`.
std::size_t operator()(const Literal::Value& value) const noexcept;
};

/// \brief TypeId::kTimestampTz maps to `int64_t`.
template <>
struct LiteralTraits<TypeId::kTimestampTz> {
using ValueType = int64_t;
};

/// \brief TypeId::kFloat maps to `float`.
template <>
struct LiteralTraits<TypeId::kFloat> {
using ValueType = float;
};

/// \brief TypeId::kDouble maps to `double`.
template <>
struct LiteralTraits<TypeId::kDouble> {
using ValueType = double;
};

/// \brief TypeId::kDecimal maps to `Decimal`.
template <>
struct LiteralTraits<TypeId::kDecimal> {
using ValueType = Decimal;
};

template <>
struct LiteralTraits<TypeId::kString> {
using ValueType = std::string;
/// \brief Hash functor for Literal.
///
/// Forwards the value returned by `Literal::value()` to LiteralValueHash, so
/// the result is exactly the LiteralValueHash of that value.
struct ICEBERG_EXPORT LiteralHash {
  /// \param value The literal to hash.
  /// \return The LiteralValueHash of `value.value()`.
  std::size_t operator()(const Literal& value) const noexcept {
    const auto& stored = value.value();
    return LiteralValueHash{}(stored);
  }
};

template <>
struct LiteralTraits<TypeId::kUuid> {
using ValueType = Uuid;
/// \brief Compile-time mapping from a TypeId to the C++ type named by
/// `ValueType`.
///
/// The primary template yields `void`; specializations override `ValueType`
/// for individual TypeId values.
template <TypeId type_id>
struct LiteralTraits {
using ValueType = void;
};

/// \brief TypeId::kBinary maps to `std::vector<uint8_t>`.
template <>
struct LiteralTraits<TypeId::kBinary> {
using ValueType = std::vector<uint8_t>;
};
// Expands to a full specialization LiteralTraits<TypeId::TYPE_ID> whose
// ValueType alias is VALUE_TYPE.
#define DEFINE_LITERAL_TRAIT(TYPE_ID, VALUE_TYPE) \
template <> \
struct LiteralTraits<TypeId::TYPE_ID> { \
using ValueType = VALUE_TYPE; \
};

/// \brief TypeId::kFixed maps to `std::vector<uint8_t>`.
template <>
struct LiteralTraits<TypeId::kFixed> {
using ValueType = std::vector<uint8_t>;
};
// One specialization per TypeId; several TypeIds share a value type.
DEFINE_LITERAL_TRAIT(kBoolean, bool)
DEFINE_LITERAL_TRAIT(kInt, int32_t)
DEFINE_LITERAL_TRAIT(kDate, int32_t)
DEFINE_LITERAL_TRAIT(kLong, int64_t)
DEFINE_LITERAL_TRAIT(kTime, int64_t)
DEFINE_LITERAL_TRAIT(kTimestamp, int64_t)
DEFINE_LITERAL_TRAIT(kTimestampTz, int64_t)
DEFINE_LITERAL_TRAIT(kFloat, float)
DEFINE_LITERAL_TRAIT(kDouble, double)
DEFINE_LITERAL_TRAIT(kDecimal, Decimal)
DEFINE_LITERAL_TRAIT(kString, std::string)
DEFINE_LITERAL_TRAIT(kUuid, Uuid)
DEFINE_LITERAL_TRAIT(kBinary, std::vector<uint8_t>)
DEFINE_LITERAL_TRAIT(kFixed, std::vector<uint8_t>)

// The helper macro is only needed for the list above; undefine it so it does
// not remain defined for code that includes this header.
#undef DEFINE_LITERAL_TRAIT

} // namespace iceberg
Loading
Loading