Skip to content

Commit dca45ea

Browse files
authored
chore: Add microbenchmark for casting string to temporal types (apache#2980)
1 parent fc122e8 commit dca45ea

File tree

1 file changed

+101
-0
lines changed

1 file changed

+101
-0
lines changed
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.spark.sql.benchmark
21+
22+
/**
 * Describes one String-to-temporal cast benchmark case.
 *
 * Declared `final`: extending a case class breaks the generated `equals`/`hashCode`/`copy`
 * contract, so subclassing is ruled out at compile time.
 *
 * @param name
 *   label handed to `runExpressionBenchmark` to identify this case
 * @param query
 *   SQL text handed to `runExpressionBenchmark` as the expression to measure
 * @param extraCometConfigs
 *   additional entries forwarded to `runExpressionBenchmark` for this case; empty by default.
 *   NOTE(review): presumably Spark/Comet conf key-value pairs -- confirm against
 *   CometBenchmarkBase.
 */
final case class CastStringToTemporalConfig(
    name: String,
    query: String,
    extraCometConfigs: Map[String, String] = Map.empty)
26+
27+
// spotless:off
/**
 * Microbenchmark for Comet casts from String to temporal types (DATE and TIMESTAMP), covering
 * both CAST and TRY_CAST. To run this benchmark:
 * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark`
 * Results will be written to "spark/benchmarks/CometCastStringToTemporalBenchmark-**results.txt".
 */
// spotless:on
object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {

  /** CAST and TRY_CAST of column `c1` to DATE, read from `parquetV1Table`. */
  private val dateCastConfigs = List(
    CastStringToTemporalConfig(
      name = "Cast String to Date",
      query = "SELECT CAST(c1 AS DATE) FROM parquetV1Table"),
    CastStringToTemporalConfig(
      name = "Try_Cast String to Date",
      query = "SELECT TRY_CAST(c1 AS DATE) FROM parquetV1Table"))

  /** CAST and TRY_CAST of column `c1` to TIMESTAMP, read from `parquetV1Table`. */
  private val timestampCastConfigs = List(
    CastStringToTemporalConfig(
      name = "Cast String to Timestamp",
      query = "SELECT CAST(c1 AS TIMESTAMP) FROM parquetV1Table"),
    CastStringToTemporalConfig(
      name = "Try_Cast String to Timestamp",
      query = "SELECT TRY_CAST(c1 AS TIMESTAMP) FROM parquetV1Table"))

  /**
   * Materialises one input data set and runs every case in `configs` against it, so the data is
   * generated once per temporal type rather than once per case.
   *
   * @param label
   *   name passed to `runBenchmarkWithTable` for this data set
   * @param rowCount
   *   row count passed to `runBenchmarkWithTable`
   * @param configs
   *   benchmark cases to run, in order, against the prepared `parquetV1Table`
   * @param inputSql
   *   SQL producing the string column `c1`; by-name so it is only evaluated inside the
   *   `runBenchmarkWithTable` / `withTempPath` / `withTempTable` scopes
   */
  private def runTemporalCastBenchmarks(
      label: String,
      rowCount: Int,
      configs: List[CastStringToTemporalConfig])(inputSql: => String): Unit = {
    runBenchmarkWithTable(label, rowCount) { cardinality =>
      withTempPath { outputDir =>
        withTempTable("parquetV1Table") {
          prepareTable(outputDir, spark.sql(inputSql))

          // Every case reuses the table prepared just above.
          for (cfg <- configs) {
            runExpressionBenchmark(cfg.name, cardinality, cfg.query, cfg.extraCometConfigs)
          }
        }
      }
    }
  }

  /**
   * Entry point: runs the date cast cases first, then the timestamp cast cases.
   *
   * @param mainArgs
   *   command-line arguments; not read by this benchmark
   */
  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
    val rowCount = 10 * 1024 * 1024 // 10,485,760 rows (~10M)

    // Rows whose `value` is a multiple of 10 become an unparseable literal (the ~10% invalid
    // share this benchmark aims for, assuming evenly distributed values). The rest are dates
    // offset from 2020-01-01 by fewer than 3650 days.
    // stripMargin keeps the SQL text free of source indentation.
    runTemporalCastBenchmarks("date data generation", rowCount, dateCastConfigs) {
      s"""
         |SELECT CASE
         |WHEN value % 10 = 0 THEN 'invalid-date'
         |ELSE CAST(DATE_ADD('2020-01-01', CAST(value % 3650 AS INT)) AS STRING)
         |END AS c1
         |FROM $tbl
         |""".stripMargin
    }

    // Same invalid-row rule for timestamps. The modulo keeps the microsecond offset passed to
    // TIMESTAMP_MICROS below 9,999,999,999 in magnitude.
    runTemporalCastBenchmarks("timestamp data generation", rowCount, timestampCastConfigs) {
      s"""
         |SELECT CASE
         |WHEN value % 10 = 0 THEN 'not-a-timestamp'
         |ELSE CAST(TIMESTAMP_MICROS(value % 9999999999) AS STRING)
         |END AS c1
         |FROM $tbl
         |""".stripMargin
    }
  }
}

0 commit comments

Comments
 (0)