Skip to content

Commit 092d88c

Browse files
authored
perf: Add microbenchmark for hash expressions (#3028)
1 parent 89ebbd7 commit 092d88c

File tree

1 file changed

+74
-0
lines changed

1 file changed

+74
-0
lines changed
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.spark.sql.benchmark
21+
22+
/**
 * Describes a single hash expression benchmark case.
 *
 * @param name
 *   label identifying the benchmark case
 * @param query
 *   SQL text of the query to benchmark
 * @param extraCometConfigs
 *   additional configuration key/value pairs associated with this case; empty by default
 */
case class HashExprConfig(
    name: String,
    query: String,
    extraCometConfigs: Map[String, String] = Map.empty[String, String])
26+
27+
/**
 * Benchmark suite for Comet hash expressions (xxhash64, murmur3 `hash`, sha1 and sha2). To run
 * this benchmark:
 * {{{
 *   SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometHashExpressionBenchmark
 * }}}
 * Results will be written to "spark/benchmarks/CometHashExpressionBenchmark-**results.txt".
 */
object CometHashExpressionBenchmark extends CometBenchmarkBase {

  // Bit lengths passed as the second argument of sha2; one benchmark case is created per entry.
  private val sha2BitLengths = List(224, 256, 384, 512)

  // Every benchmark case, in execution order. All queries read from "parquetV1Table".
  private val hashExpressions =
    List(
      HashExprConfig("xxhash64_single", "SELECT xxhash64(c_str) FROM parquetV1Table"),
      HashExprConfig("xxhash64_multi", "SELECT xxhash64(c_str, c_int, c_long) FROM parquetV1Table"),
      HashExprConfig("murmur3_hash_single", "SELECT hash(c_str) FROM parquetV1Table"),
      HashExprConfig("murmur3_hash_multi", "SELECT hash(c_str, c_int, c_long) FROM parquetV1Table"),
      HashExprConfig("sha1", "SELECT sha1(c_str) FROM parquetV1Table")) ++
      sha2BitLengths.map { bits =>
        HashExprConfig(s"sha2_$bits", s"SELECT sha2(c_str, $bits) FROM parquetV1Table")
      }

  /**
   * Prepares the benchmark table once and then runs every entry of `hashExpressions` against it.
   *
   * @param mainArgs
   *   command-line arguments; not used by this benchmark
   */
  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
    val numRows = 1024 * 1024

    runBenchmarkWithTable("Hash expression benchmarks", numRows) { rowCount =>
      withTempPath { outputDir =>
        withTempTable("parquetV1Table") {
          // Each column is NULL on one residue class of `value % 100` (a different one per
          // column), i.e. roughly 1% NULLs per column:
          //   - c_str:  "string_<value>", so distinct for distinct values
          //   - c_int:  value % 1000000, i.e. 0-999,999 (cycling)
          //   - c_long: value * 1000
          // NOTE(review): assumes $tbl exposes an integral `value` column - confirm in
          // CometBenchmarkBase.
          val sourceData = spark.sql(s"""
            SELECT
              CASE WHEN value % 100 = 0 THEN NULL ELSE CONCAT('string_', CAST(value AS STRING)) END AS c_str,
              CASE WHEN value % 100 = 1 THEN NULL ELSE CAST(value % 1000000 AS INT) END AS c_int,
              CASE WHEN value % 100 = 2 THEN NULL ELSE CAST(value * 1000 AS LONG) END AS c_long
            FROM $tbl
          """)
          prepareTable(outputDir, sourceData)

          for (expr <- hashExpressions) {
            runExpressionBenchmark(expr.name, rowCount, expr.query, expr.extraCometConfigs)
          }
        }
      }
    }
  }
}

0 commit comments

Comments
 (0)