diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala
deleted file mode 100644
index 975abd632f..0000000000
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.benchmark
-
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{DataType, LongType}
-
-import org.apache.comet.expressions.{CometCast, CometEvalMode}
-import org.apache.comet.serde.{Compatible, Incompatible, Unsupported}
-
-/**
- * Benchmark to measure Comet execution performance. To run this benchmark:
- * {{{
- * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBenchmark
- * }}}
- *
- * Results will be written to "spark/benchmarks/CometCastBenchmark-**results.txt".
- */
-
-object CometCastBenchmark extends CometBenchmarkBase {
-
-  override def getSparkSession: SparkSession = {
-    val session = super.getSparkSession
-    session.conf.set("parquet.enable.dictionary", "false")
-    session.conf.set("spark.sql.shuffle.partitions", "2")
-    session
-  }
-
-  def castExprSQL(toDataType: DataType, input: String): String = {
-    s"CAST ($input AS ${toDataType.sql})"
-  }
-
-  override def runCometBenchmark(args: Array[String]): Unit = {
-
-    // TODO : Create all possible input datatypes. We only have Long inputs for now
-    CometCast.supportedTypes.foreach { toDataType =>
-      Seq(false, true).foreach { ansiMode =>
-        CometCast.isSupported(
-          LongType,
-          toDataType,
-          None,
-          if (ansiMode) CometEvalMode.ANSI else CometEvalMode.LEGACY) match {
-          case Compatible(notes) =>
-            runBenchmarkWithTable(
-              s"Running benchmark cast operation from : $LongType to : $toDataType",
-              1024 * 1024 * 10) { v =>
-              castBenchmark(v, LongType, toDataType, isAnsiMode = ansiMode)
-            }
-          case Incompatible(notes) => None
-          case Unsupported(notes) => None
-        }
-      }
-    }
-  }
-
-  def castBenchmark(
-      values: Int,
-      fromDataType: DataType,
-      toDataType: DataType,
-      isAnsiMode: Boolean): Unit = {
-
-    withTempPath { dir =>
-      withTempTable("parquetV1Table") {
-        prepareTable(dir, spark.sql(s"SELECT value FROM $tbl"))
-
-        val functionSQL = castExprSQL(toDataType, "value")
-        val query = s"SELECT $functionSQL FROM parquetV1Table"
-        val name =
-          s"Cast function to : ${toDataType} , ansi mode enabled : ${isAnsiMode}"
-
-        val extraConfigs = Map(SQLConf.ANSI_ENABLED.key -> isAnsiMode.toString)
-
-        runExpressionBenchmark(name, values, query, extraConfigs)
-      }
-    }
-  }
-
-}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala
new file mode 100644
index 0000000000..57b8e88a7b
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastBooleanConfig(
+    name: String,
+    query: String,
+    extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet cast operations involving the Boolean type. To run
+ * this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBooleanBenchmark
+ * }}}
+ * Results will be written to "spark/benchmarks/CometCastBooleanBenchmark-**results.txt".
+ */
+object CometCastBooleanBenchmark extends CometBenchmarkBase {
+
+  private val castFunctions = Seq("CAST", "TRY_CAST")
+
+  // Boolean to String
+  private val boolToStringConfigs = for {
+    castFunc <- castFunctions
+  } yield CastBooleanConfig(
+    s"$castFunc Boolean to String",
+    s"SELECT $castFunc(c_bool AS STRING) FROM parquetV1Table")
+
+  // Boolean to numeric types
+  private val boolToNumericTypes =
+    Seq("BYTE", "SHORT", "INT", "LONG", "FLOAT", "DOUBLE", "DECIMAL(10,2)")
+  private val boolToNumericConfigs = for {
+    castFunc <- castFunctions
+    targetType <- boolToNumericTypes
+  } yield CastBooleanConfig(
+    s"$castFunc Boolean to $targetType",
+    s"SELECT $castFunc(c_bool AS $targetType) FROM parquetV1Table")
+
+  // Numeric to Boolean
+  private val numericTypes = Seq(
+    ("BYTE", "c_byte"),
+    ("SHORT", "c_short"),
+    ("INT", "c_int"),
+    ("LONG", "c_long"),
+    ("FLOAT", "c_float"),
+    ("DOUBLE", "c_double"),
+    ("DECIMAL(10,2)", "c_decimal"))
+
+  private val numericToBoolConfigs = for {
+    castFunc <- castFunctions
+    (sourceType, colName) <- numericTypes
+  } yield CastBooleanConfig(
+    s"$castFunc $sourceType to Boolean",
+    s"SELECT $castFunc($colName AS BOOLEAN) FROM parquetV1Table")
+
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    val values = 1024 * 1024 // 1M rows
+
+    // Generate boolean data for boolean-to-other casts
+    runBenchmarkWithTable("Boolean to other types casts", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, ~50/50 true/false
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT CASE
+                WHEN value % 100 = 0 THEN NULL
+                ELSE (value % 2 = 0)
+              END AS c_bool
+              FROM $tbl
+            """))
+
+          (boolToStringConfigs ++ boolToNumericConfigs).foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+
+    // Generate numeric data for numeric-to-boolean casts
+    runBenchmarkWithTable("Numeric to Boolean casts", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL per column, values in {-1, 0, 1} (~33% each)
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT
+                CASE WHEN value % 100 = 0 THEN NULL ELSE CAST((value % 3) - 1 AS BYTE) END AS c_byte,
+                CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 3) - 1 AS SHORT) END AS c_short,
+                CASE WHEN value % 100 = 2 THEN NULL ELSE CAST((value % 3) - 1 AS INT) END AS c_int,
+                CASE WHEN value % 100 = 3 THEN NULL ELSE CAST((value % 3) - 1 AS LONG) END AS c_long,
+                CASE WHEN value % 100 = 4 THEN NULL ELSE CAST((value % 3) - 1 AS FLOAT) END AS c_float,
+                CASE WHEN value % 100 = 5 THEN NULL ELSE CAST((value % 3) - 1 AS DOUBLE) END AS c_double,
+                CASE WHEN value % 100 = 6 THEN NULL ELSE CAST((value % 3) - 1 AS DECIMAL(10,2)) END AS c_decimal
+              FROM $tbl
+            """))
+
+          numericToBoolConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala
new file mode 100644
index 0000000000..a9ea19a0e9
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastNumericToNumericConfig(
+    name: String,
+    query: String,
+    extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet casts between numeric types. To run this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToNumericBenchmark
+ * }}}
+ * Results will be written to "spark/benchmarks/CometCastNumericToNumericBenchmark-**results.txt".
+ */
+object CometCastNumericToNumericBenchmark extends CometBenchmarkBase {
+
+  private val castFunctions = Seq("CAST", "TRY_CAST")
+
+  // Integer widening conversions
+  private val integerWideningPairs = Seq(
+    ("BYTE", "c_byte", "SHORT"),
+    ("BYTE", "c_byte", "INT"),
+    ("BYTE", "c_byte", "LONG"),
+    ("SHORT", "c_short", "INT"),
+    ("SHORT", "c_short", "LONG"),
+    ("INT", "c_int", "LONG"))
+
+  // Integer narrowing conversions
+  private val integerNarrowingPairs = Seq(
+    ("LONG", "c_long", "INT"),
+    ("LONG", "c_long", "SHORT"),
+    ("LONG", "c_long", "BYTE"),
+    ("INT", "c_int", "SHORT"),
+    ("INT", "c_int", "BYTE"),
+    ("SHORT", "c_short", "BYTE"))
+
+  // Floating point conversions
+  private val floatPairs = Seq(("FLOAT", "c_float", "DOUBLE"), ("DOUBLE", "c_double", "FLOAT"))
+
+  // Integer to floating point conversions
+  private val intToFloatPairs = Seq(
+    ("BYTE", "c_byte", "FLOAT"),
+    ("SHORT", "c_short", "FLOAT"),
+    ("INT", "c_int", "FLOAT"),
+    ("LONG", "c_long", "FLOAT"),
+    ("INT", "c_int", "DOUBLE"),
+    ("LONG", "c_long", "DOUBLE"))
+
+  // Floating point to integer conversions
+  private val floatToIntPairs = Seq(
+    ("FLOAT", "c_float", "INT"),
+    ("FLOAT", "c_float", "LONG"),
+    ("DOUBLE", "c_double", "INT"),
+    ("DOUBLE", "c_double", "LONG"))
+
+  // Decimal conversions
+  private val decimalPairs = Seq(
+    ("INT", "c_int", "DECIMAL(10,2)"),
+    ("LONG", "c_long", "DECIMAL(20,4)"),
+    ("DOUBLE", "c_double", "DECIMAL(15,5)"),
+    ("DECIMAL(10,2)", "c_decimal", "INT"),
+    ("DECIMAL(10,2)", "c_decimal", "LONG"),
+    ("DECIMAL(10,2)", "c_decimal", "DOUBLE"))
+
+  private def generateConfigs(
+      pairs: Seq[(String, String, String)]): Seq[CastNumericToNumericConfig] = {
+    for {
+      castFunc <- castFunctions
+      (sourceType, colName, targetType) <- pairs
+    } yield CastNumericToNumericConfig(
+      s"$castFunc $sourceType to $targetType",
+      s"SELECT $castFunc($colName AS $targetType) FROM parquetV1Table")
+  }
+
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    val values = 1024 * 1024 // 1M rows
+
+    // Generate input data once with all numeric types
+    runBenchmarkWithTable("Numeric to Numeric casts", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL per column
+          // - c_byte: values -64 to 63
+          // - c_short: values -16384 to 16383
+          // - c_int: negative values (approx -2.5M to -1.45M)
+          // - c_long: large positive values (0 to ~1 billion)
+          // - c_float/c_double: 3% special values (NaN, Infinity, -Infinity), rest approx -25000 to -14514
+          // - c_decimal: values approx -25000.00 to -14514.25
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT
+                CASE WHEN value % 100 = 0 THEN NULL ELSE CAST((value % 128) - 64 AS BYTE) END AS c_byte,
+                CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 32768) - 16384 AS SHORT) END AS c_short,
+                CASE WHEN value % 100 = 2 THEN NULL ELSE CAST(value - 2500000 AS INT) END AS c_int,
+                CASE WHEN value % 100 = 3 THEN NULL ELSE CAST(value * 1000 AS LONG) END AS c_long,
+                CASE
+                  WHEN value % 100 = 4 THEN NULL
+                  WHEN value % 100 = 5 THEN CAST('NaN' AS FLOAT)
+                  WHEN value % 100 = 6 THEN CAST('Infinity' AS FLOAT)
+                  WHEN value % 100 = 7 THEN CAST('-Infinity' AS FLOAT)
+                  ELSE CAST((value - 2500000) / 100.0 AS FLOAT)
+                END AS c_float,
+                CASE
+                  WHEN value % 100 = 8 THEN NULL
+                  WHEN value % 100 = 9 THEN CAST('NaN' AS DOUBLE)
+                  WHEN value % 100 = 10 THEN CAST('Infinity' AS DOUBLE)
+                  WHEN value % 100 = 11 THEN CAST('-Infinity' AS DOUBLE)
+                  ELSE CAST((value - 2500000) / 100.0 AS DOUBLE)
+                END AS c_double,
+                CASE WHEN value % 100 = 12 THEN NULL ELSE CAST((value - 2500000) / 100.0 AS DECIMAL(10,2)) END AS c_decimal
+              FROM $tbl
+            """))
+
+          // Run all benchmark categories
+          (generateConfigs(integerWideningPairs) ++
+            generateConfigs(integerNarrowingPairs) ++
+            generateConfigs(floatPairs) ++
+            generateConfigs(intToFloatPairs) ++
+            generateConfigs(floatToIntPairs) ++
+            generateConfigs(decimalPairs)).foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala
new file mode 100644
index 0000000000..1fd2138c58
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastNumericToStringConfig(
+    name: String,
+    query: String,
+    extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet casts from boolean and numeric types to String.
+ * To run this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToStringBenchmark
+ * }}}
+ * Results will be written to "spark/benchmarks/CometCastNumericToStringBenchmark-**results.txt".
+ */
+object CometCastNumericToStringBenchmark extends CometBenchmarkBase {
+
+  private val castFunctions = Seq("CAST", "TRY_CAST")
+  private val sourceTypes =
+    Seq(
+      ("BOOLEAN", "c_bool"),
+      ("BYTE", "c_byte"),
+      ("SHORT", "c_short"),
+      ("INT", "c_int"),
+      ("LONG", "c_long"),
+      ("FLOAT", "c_float"),
+      ("DOUBLE", "c_double"),
+      ("DECIMAL(10,2)", "c_decimal"))
+
+  private val castConfigs = for {
+    castFunc <- castFunctions
+    (sourceType, colName) <- sourceTypes
+  } yield CastNumericToStringConfig(
+    s"$castFunc $sourceType to String",
+    s"SELECT $castFunc($colName AS STRING) FROM parquetV1Table")
+
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    val values = 1024 * 1024 // 1M rows
+
+    // Generate input data once with all source types
+    runBenchmarkWithTable("Numeric to String casts", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL per column
+          // - c_bool: ~50/50 true/false
+          // - c_byte: values -64 to 63
+          // - c_short: values -16384 to 16383
+          // - c_int: approx -2.5M to -1.45M; c_long: 0 to ~1 trillion
+          // - c_float/c_double: 3% special values (NaN, Infinity, -Infinity), rest negative values
+          // - c_decimal: values approx -25000.00 to -14514.25
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT
+                CASE WHEN value % 100 = 0 THEN NULL ELSE (value % 2 = 0) END AS c_bool,
+                CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 128) - 64 AS BYTE) END AS c_byte,
+                CASE WHEN value % 100 = 2 THEN NULL ELSE CAST((value % 32768) - 16384 AS SHORT) END AS c_short,
+                CASE WHEN value % 100 = 3 THEN NULL ELSE CAST(value - 2500000 AS INT) END AS c_int,
+                CASE WHEN value % 100 = 4 THEN NULL ELSE CAST(value * 1000000 AS LONG) END AS c_long,
+                CASE
+                  WHEN value % 100 = 5 THEN NULL
+                  WHEN value % 100 = 6 THEN CAST('NaN' AS FLOAT)
+                  WHEN value % 100 = 7 THEN CAST('Infinity' AS FLOAT)
+                  WHEN value % 100 = 8 THEN CAST('-Infinity' AS FLOAT)
+                  ELSE CAST((value - 2500000) / 1000.0 AS FLOAT)
+                END AS c_float,
+                CASE
+                  WHEN value % 100 = 9 THEN NULL
+                  WHEN value % 100 = 10 THEN CAST('NaN' AS DOUBLE)
+                  WHEN value % 100 = 11 THEN CAST('Infinity' AS DOUBLE)
+                  WHEN value % 100 = 12 THEN CAST('-Infinity' AS DOUBLE)
+                  ELSE CAST((value - 2500000) / 100.0 AS DOUBLE)
+                END AS c_double,
+                CASE WHEN value % 100 = 13 THEN NULL ELSE CAST((value - 2500000) / 100.0 AS DECIMAL(10,2)) END AS c_decimal
+              FROM $tbl
+            """))
+
+          castConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala
new file mode 100644
index 0000000000..ec2d9ab12f
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastNumericToTemporalConfig(
+    name: String,
+    query: String,
+    extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet casts from numeric types to temporal types. To run
+ * this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToTemporalBenchmark
+ * }}}
+ * Results will be written to
+ * "spark/benchmarks/CometCastNumericToTemporalBenchmark-**results.txt".
+ */
+object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase {
+
+  private val castFunctions = Seq("CAST", "TRY_CAST")
+
+  // INT to DATE (days since epoch)
+  private val intToDateConfigs = for {
+    castFunc <- castFunctions
+  } yield CastNumericToTemporalConfig(
+    s"$castFunc Int to Date",
+    s"SELECT $castFunc(c_int AS DATE) FROM parquetV1Table")
+
+  // LONG to TIMESTAMP (Spark interprets the value as seconds since the epoch)
+  private val longToTimestampConfigs = for {
+    castFunc <- castFunctions
+  } yield CastNumericToTemporalConfig(
+    s"$castFunc Long to Timestamp",
+    s"SELECT $castFunc(c_long AS TIMESTAMP) FROM parquetV1Table")
+
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    val values = 1024 * 1024 // 1M rows
+
+    // Generate data once for INT to DATE conversions
+    runBenchmarkWithTable("Int to Date casts", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, days since epoch spanning ~100 years (1920-2020)
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT CASE
+                WHEN value % 100 = 0 THEN NULL
+                ELSE CAST((value % 36500) - 18000 AS INT)
+              END AS c_int
+              FROM $tbl
+            """))
+
+          intToDateConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+
+    // Generate data once for LONG to TIMESTAMP conversions
+    runBenchmarkWithTable("Long to Timestamp casts", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, seconds since epoch spanning ~1 year from 2020-01-01
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT CASE
+                WHEN value % 100 = 0 THEN NULL
+                ELSE 1577836800 + (value * 30) % 31536000
+              END AS c_long
+              FROM $tbl
+            """))
+
+          longToTimestampConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala
index 7f210fc730..c71eadad8c 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala
@@ -68,8 +68,11 @@ object CometCastStringToNumericBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("String to numeric casts", values) { v =>
       withTempPath { dir =>
         withTempTable("parquetV1Table") {
-          // Generate numeric strings with both integer and decimal values
-          // Also include some special values: nulls (~2%), NaN (~2%), Infinity (~2%)
+          // Data distribution:
+          // - 2% NULL, 2% 'NaN', 2% 'Infinity', 2% '-Infinity'
+          // - 12% small integers (0-98)
+          // - 40% medium integers (0-999,998)
+          // - 40% decimals centered around 0 (approx -5000.00 to +5000.00)
           prepareTable(
             dir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
index 39337be5c8..77cc009ae1 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
@@ -24,14 +24,14 @@ case class CastStringToTemporalConfig(
     query: String,
     extraCometConfigs: Map[String, String] = Map.empty)
 
-// spotless:off
 /**
  * Benchmark to measure performance of Comet cast from String to temporal types. To run this
  * benchmark:
- * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark`
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark
+ * }}}
  * Results will be written to "spark/benchmarks/CometCastStringToTemporalBenchmark-**results.txt".
  */
-// spotless:on
 object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
 
   // Configuration for String to temporal cast benchmarks
@@ -52,12 +52,13 @@ object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
         "SELECT TRY_CAST(c1 AS TIMESTAMP) FROM parquetV1Table"))
 
   override def runCometBenchmark(mainArgs: Array[String]): Unit = {
-    val values = 1024 * 1024 * 10 // 10M rows
+    val values = 1024 * 1024 // 1M rows
 
     // Generate date data once with ~10% invalid values
     runBenchmarkWithTable("date data generation", values) { v =>
       withTempPath { dateDir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 10% invalid strings, 90% valid date strings spanning ~10 years
           prepareTable(
             dateDir,
             spark.sql(s"""
@@ -80,6 +81,7 @@ object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
     runBenchmarkWithTable("timestamp data generation", values) { v =>
       withTempPath { timestampDir =>
         withTempTable("parquetV1Table") {
+          // Data distribution: 10% invalid strings, 90% valid timestamp strings (1970 epoch range)
           prepareTable(
             timestampDir,
             spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala
new file mode 100644
index 0000000000..1468cbe086
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastTemporalToNumericConfig(
+    name: String,
+    query: String,
+    extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet casts from temporal types to numeric types. To run
+ * this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToNumericBenchmark
+ * }}}
+ * Results will be written to
+ * "spark/benchmarks/CometCastTemporalToNumericBenchmark-**results.txt".
+ */
+object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase {
+
+  private val castFunctions = Seq("CAST", "TRY_CAST")
+
+  // DATE to numeric types
+  private val dateToNumericTypes = Seq("BYTE", "SHORT", "INT", "LONG")
+  private val dateToNumericConfigs = for {
+    castFunc <- castFunctions
+    targetType <- dateToNumericTypes
+  } yield CastTemporalToNumericConfig(
+    s"$castFunc Date to $targetType",
+    s"SELECT $castFunc(c_date AS $targetType) FROM parquetV1Table")
+
+  // TIMESTAMP to numeric types
+  private val timestampToNumericTypes = Seq("BYTE", "SHORT", "INT", "LONG")
+  private val timestampToNumericConfigs = for {
+    castFunc <- castFunctions
+    targetType <- timestampToNumericTypes
+  } yield CastTemporalToNumericConfig(
+    s"$castFunc Timestamp to $targetType",
+    s"SELECT $castFunc(c_timestamp AS $targetType) FROM parquetV1Table")
+
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    val values = 1024 * 1024 // 1M rows
+
+    // Generate DATE data once for all date-to-numeric benchmarks
+    runBenchmarkWithTable("Date to Numeric casts", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT CASE
+                WHEN value % 100 = 0 THEN NULL
+                ELSE DATE_ADD('2020-01-01', CAST(value % 3650 AS INT))
+              END AS c_date
+              FROM $tbl
+            """))
+
+          dateToNumericConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+
+    // Generate TIMESTAMP data once for all timestamp-to-numeric benchmarks
+    runBenchmarkWithTable("Timestamp to Numeric casts", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT CASE
+                WHEN value % 100 = 0 THEN NULL
+                ELSE TIMESTAMP_MICROS(1577836800000000 + (value * 31536000) % 31536000000000)
+              END AS c_timestamp
+              FROM $tbl
+            """))
+
+          timestampToNumericConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala
new file mode 100644
index 0000000000..1ef3e7711d
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastTemporalToStringConfig(
+    name: String,
+    query: String,
+    extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet casts from temporal types to String. To run this
+ * benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToStringBenchmark
+ * }}}
+ * Results will be written to "spark/benchmarks/CometCastTemporalToStringBenchmark-**results.txt".
+ */
+object CometCastTemporalToStringBenchmark extends CometBenchmarkBase {
+
+  private val castFunctions = Seq("CAST", "TRY_CAST")
+
+  private val dateCastConfigs = for {
+    castFunc <- castFunctions
+  } yield CastTemporalToStringConfig(
+    s"$castFunc Date to String",
+    s"SELECT $castFunc(c_date AS STRING) FROM parquetV1Table")
+
+  private val timestampCastConfigs = for {
+    castFunc <- castFunctions
+  } yield CastTemporalToStringConfig(
+    s"$castFunc Timestamp to String",
+    s"SELECT $castFunc(c_timestamp AS STRING) FROM parquetV1Table")
+
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    val values = 1024 * 1024 // 1M rows
+
+    // Generate temporal data once for date benchmarks
+    runBenchmarkWithTable("Date to String casts", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT CASE
+                WHEN value % 100 = 0 THEN NULL
+                ELSE DATE_ADD('2020-01-01', CAST(value % 3650 AS INT))
+              END AS c_date
+              FROM $tbl
+            """))
+
+          dateCastConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+
+    // Generate temporal data once for timestamp benchmarks
+    runBenchmarkWithTable("Timestamp to String casts", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT CASE
+                WHEN value % 100 = 0 THEN NULL
+                ELSE TIMESTAMP_MICROS(1577836800000000 + (value * 31536000) % 31536000000000)
+              END AS c_timestamp
+              FROM $tbl
+            """))
+
+          timestampCastConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala
new file mode 100644
index 0000000000..f2e2572487
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastTemporalToTemporalConfig(
+    name: String,
+    query: String,
+    extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet casts between temporal types. To run this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToTemporalBenchmark
+ * }}}
+ * Results will be written to
+ * "spark/benchmarks/CometCastTemporalToTemporalBenchmark-**results.txt".
+ */
+object CometCastTemporalToTemporalBenchmark extends CometBenchmarkBase {
+
+  private val castFunctions = Seq("CAST", "TRY_CAST")
+
+  // Date to Timestamp
+  private val dateToTimestampConfigs = for {
+    castFunc <- castFunctions
+  } yield CastTemporalToTemporalConfig(
+    s"$castFunc Date to Timestamp",
+    s"SELECT $castFunc(c_date AS TIMESTAMP) FROM parquetV1Table")
+
+  // Timestamp to Date
+  private val timestampToDateConfigs = for {
+    castFunc <- castFunctions
+  } yield CastTemporalToTemporalConfig(
+    s"$castFunc Timestamp to Date",
+    s"SELECT $castFunc(c_timestamp AS DATE) FROM parquetV1Table")
+
+  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+    val values = 1024 * 1024 // 1M rows
+
+    // Generate DATE data for Date -> Timestamp benchmarks
+    runBenchmarkWithTable("Date to Timestamp casts", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT CASE
+                WHEN value % 100 = 0 THEN NULL
+                ELSE DATE_ADD('2020-01-01', CAST(value % 3650 AS INT))
+              END AS c_date
+              FROM $tbl
+            """))
+
+          dateToTimestampConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+
+    // Generate TIMESTAMP data for Timestamp -> Date benchmarks
+    runBenchmarkWithTable("Timestamp to Date casts", values) { v =>
+      withTempPath { dir =>
+        withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
+          prepareTable(
+            dir,
+            spark.sql(s"""
+              SELECT CASE
+                WHEN value % 100 = 0 THEN NULL
+                ELSE TIMESTAMP_MICROS(1577836800000000 + (value * 31536000) % 31536000000000)
+              END AS c_timestamp
+              FROM $tbl
+            """))
+
+          timestampToDateConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+          }
+        }
+      }
+    }
+  }
+}
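Each benchmark above documents its own run command in its scaladoc. Run back to back, those same make targets regenerate the whole cast suite; the commands below are copied from the scaladocs (the CometCastStringToNumericBenchmark target is not quoted in its hunk but follows the same naming pattern):

```shell
SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBooleanBenchmark
SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToNumericBenchmark
SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToStringBenchmark
SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToTemporalBenchmark
SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastStringToNumericBenchmark
SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark
SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToNumericBenchmark
SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToStringBenchmark
SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToTemporalBenchmark
```

Each run writes its results file under spark/benchmarks/, as noted in the corresponding scaladoc.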