
Commit 2588e13

fix: partially fix consistency issue of hash functions with decimal input (#1295)
1 parent c4ef5ed commit 2588e13

2 files changed: +42 -0 lines

native/spark-expr/src/hash_funcs/utils.rs

Lines changed: 25 additions & 0 deletions
@@ -104,6 +104,26 @@ macro_rules! hash_array_primitive_float {
     };
 }
 
+#[macro_export]
+macro_rules! hash_array_small_decimal {
+    ($array_type:ident, $column: ident, $hashes: ident, $hash_method: ident) => {
+        let array = $column.as_any().downcast_ref::<$array_type>().unwrap();
+
+        if array.null_count() == 0 {
+            for (i, hash) in $hashes.iter_mut().enumerate() {
+                *hash = $hash_method(i64::try_from(array.value(i)).unwrap().to_le_bytes(), *hash);
+            }
+        } else {
+            for (i, hash) in $hashes.iter_mut().enumerate() {
+                if !array.is_null(i) {
+                    *hash =
+                        $hash_method(i64::try_from(array.value(i)).unwrap().to_le_bytes(), *hash);
+                }
+            }
+        }
+    };
+}
+
 #[macro_export]
 macro_rules! hash_array_decimal {
     ($array_type:ident, $column: ident, $hashes: ident, $hash_method: ident) => {
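For readers skimming the macro, here is a minimal standalone sketch (not the Comet code itself) of what hash_array_small_decimal does. The spark_compatible_hash function below is only a stand-in with the same shape as the real Murmur3/xxHash64 helpers the macro receives as $hash_method, and the macro's separate fast path for arrays with no nulls is folded into a single loop: the unscaled i128 value is narrowed to i64 and its little-endian bytes are hashed, the same way a LongType column would be.

use arrow::array::{Array, Decimal128Array};

// Stand-in for the Spark-compatible Murmur3/xxHash64 helpers the macro is
// instantiated with; only the shape (bytes in, seed in, hash out) matters here.
fn spark_compatible_hash(bytes: impl AsRef<[u8]>, seed: u32) -> u32 {
    bytes
        .as_ref()
        .iter()
        .fold(seed, |h, b| h.wrapping_mul(31).wrapping_add(*b as u32))
}

// Sketch of the small-decimal path: valid only when precision <= 18, so that
// i64::try_from on the unscaled value cannot fail.
fn hash_small_decimals(array: &Decimal128Array, hashes: &mut [u32]) {
    for (i, hash) in hashes.iter_mut().enumerate() {
        if !array.is_null(i) {
            let unscaled = i64::try_from(array.value(i)).unwrap();
            *hash = spark_compatible_hash(unscaled.to_le_bytes(), *hash);
        }
    }
}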
@@ -274,6 +294,11 @@ macro_rules! create_hashes_internal {
            DataType::FixedSizeBinary(_) => {
                $crate::hash_array!(FixedSizeBinaryArray, col, $hashes_buffer, $hash_method);
            }
+            // Apache Spark: if it's a small decimal, i.e. precision <= 18, turn it into long and hash it.
+            // Else, turn it into bytes and hash it.
+            DataType::Decimal128(precision, _) if *precision <= 18 => {
+                $crate::hash_array_small_decimal!(Decimal128Array, col, $hashes_buffer, $hash_method);
+            }
            DataType::Decimal128(_, _) => {
                $crate::hash_array_decimal!(Decimal128Array, col, $hashes_buffer, $hash_method);
            }
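The precision <= 18 cutoff follows the Spark behavior described in the comment above: an unscaled value with at most 18 digits is strictly below 10^18 and therefore always fits in an i64 (i64::MAX is roughly 9.22 * 10^18), while a 19-digit unscaled value can overflow it, which is why larger precisions fall through to the byte-based hash_array_decimal arm. A quick standalone check of that arithmetic (not part of the patch):

fn main() {
    // Largest unscaled value a decimal with precision 18 can hold: 10^18 - 1.
    let max_precision_18: i128 = 10i128.pow(18) - 1;
    assert!(i64::try_from(max_precision_18).is_ok()); // always representable as i64

    // A 19-digit unscaled value can already exceed i64::MAX...
    let nineteen_digits: i128 = 10i128.pow(19);
    assert!(i64::try_from(nineteen_digits).is_err());
    // ...so precision > 18 must be hashed from its bytes instead.
}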

spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala

Lines changed: 17 additions & 0 deletions
@@ -1928,6 +1928,23 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
       }
     }
   }
+
+  test("hash functions with decimal input") {
+    withTable("t1", "t2") {
+      // Apache Spark: if it's a small decimal, i.e. precision <= 18, turn it into long and hash it.
+      // Else, turn it into bytes and hash it.
+      sql("create table t1(c1 decimal(18, 2)) using parquet")
+      sql("insert into t1 values(1.23), (-1.23), (0.0), (null)")
+      checkSparkAnswerAndOperator("select c1, hash(c1), xxhash64(c1) from t1 order by c1")
+
+      // TODO: comet hash function is not compatible with spark for decimal with precision greater than 18.
+      // https://github.com/apache/datafusion-comet/issues/1294
+      // sql("create table t2(c1 decimal(20, 2)) using parquet")
+      // sql("insert into t2 values(1.23), (-1.23), (0.0), (null)")
+      // checkSparkAnswerAndOperator("select c1, hash(c1), xxhash64(c1) from t2 order by c1")
+    }
+  }
+
   test("unary negative integer overflow test") {
     def withAnsiMode(enabled: Boolean)(f: => Unit): Unit = {
       withSQLConf(
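As a rough Rust-side usage sketch, the same values the new test inserts into t1 (1.23, -1.23, 0.0, null at scale 2) could be fed to the hash_small_decimals helper from the earlier sketch. Both that helper and the per-row starting value of 42 (Spark's default seed for hash()) are illustrative here, not Comet code:

use arrow::array::{Array, Decimal128Array};

fn main() {
    // decimal(18, 2) values 1.23, -1.23, 0.0, null stored as unscaled i128 values.
    let array = Decimal128Array::from(vec![Some(123_i128), Some(-123), Some(0), None])
        .with_precision_and_scale(18, 2)
        .unwrap();

    // hash_small_decimals is the hypothetical helper from the sketch above; each row
    // starts from the seed and null rows keep it, mirroring the macro's null handling.
    let mut hashes = vec![42_u32; array.len()];
    hash_small_decimals(&array, &mut hashes);
    println!("{hashes:?}");
}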
