Commit 1a5d83b
[SPARK-25589][SQL][TEST] Add BloomFilterBenchmark
## What changes were proposed in this pull request?

This PR aims to add `BloomFilterBenchmark`. Apache Spark has supported bloom filters for the ORC data source for a long time. For the Parquet data source, support is expected to arrive with the next Parquet release.

## How was this patch tested?

Manual.

```bash
SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.BloomFilterBenchmark"
```

Closes apache#22605 from dongjoon-hyun/SPARK-25589.

Authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
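For context, ORC bloom filters are enabled per column through writer options, which is exactly the path the new benchmark exercises. Below is a minimal standalone sketch, not part of this commit: the output path, row count, and app name are illustrative, while `orc.bloom.filter.columns` and `orc.bloom.filter.fpp` are standard ORC writer options.

```scala
import org.apache.spark.sql.SparkSession

// Illustrative sketch only; object name and paths are made up for the example.
object OrcBloomFilterExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("OrcBloomFilterExample").getOrCreate()

    val df = spark.range(1000000).selectExpr("CAST(id AS INT) AS value")

    // Write ORC with a bloom filter on the "value" column.
    // "orc.bloom.filter.columns" takes a comma-separated column list;
    // "orc.bloom.filter.fpp" tunes the false-positive probability.
    df.write.mode("overwrite")
      .option("orc.bloom.filter.columns", "value")
      .option("orc.bloom.filter.fpp", "0.05")
      .orc("/tmp/orc_with_bf")

    // A point lookup: the ORC reader can skip row groups whose bloom
    // filter proves the searched value is absent.
    spark.read.orc("/tmp/orc_with_bf").where("value = 0").count()

    spark.stop()
  }
}
```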
1 parent 928d073 commit 1a5d83b

2 files changed: +111 −0 lines
benchmarks/BloomFilterBenchmark-results.txt: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
================================================================================================
ORC Write
================================================================================================

OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Write 100M rows:                         Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------
Without bloom filter                         16765 / 17587          6.0         167.7       1.0X
With bloom filter                            20060 / 20626          5.0         200.6       0.8X


================================================================================================
ORC Read
================================================================================================

OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Read a row from 100M rows:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------
Without bloom filter                          1857 / 1904         53.9          18.6       1.0X
With bloom filter                             1399 / 1437         71.5          14.0       1.3X
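For reference, the derived columns follow directly from the totals: 16765 ms over 100M rows is about 167.7 ns/row, i.e. roughly 6.0 M rows/s. The net effect in these numbers is that the bloom filter makes the write roughly 20% slower (0.8X) but the point-lookup read about 1.3X faster.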
sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql.execution.benchmark
19+
20+
import scala.util.Random
21+
22+
import org.apache.spark.benchmark.Benchmark
23+
24+
/**
25+
* Benchmark to measure read performance with Bloom filters.
26+
*
27+
* Currently, only ORC supports bloom filters, we will add Parquet BM as soon as it becomes
28+
* available.
29+
*
30+
* To run this benchmark:
31+
* {{{
32+
* 1. without sbt: bin/spark-submit --class <this class> <spark sql test jar>
33+
* 2. build/sbt "sql/test:runMain <this class>"
34+
* 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
35+
* Results will be written to "benchmarks/BloomFilterBenchmark-results.txt".
36+
* }}}
37+
*/
38+
object BloomFilterBenchmark extends SqlBasedBenchmark {
39+
import spark.implicits._
40+
41+
private val scaleFactor = 100
42+
private val N = scaleFactor * 1000 * 1000
43+
private val df = spark.range(N).map(_ => Random.nextInt)
44+
45+
private def writeBenchmark(): Unit = {
46+
withTempPath { dir =>
47+
val path = dir.getCanonicalPath
48+
49+
runBenchmark(s"ORC Write") {
50+
val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output)
51+
benchmark.addCase("Without bloom filter") { _ =>
52+
df.write.mode("overwrite").orc(path + "/withoutBF")
53+
}
54+
benchmark.addCase("With bloom filter") { _ =>
55+
df.write.mode("overwrite")
56+
.option("orc.bloom.filter.columns", "value").orc(path + "/withBF")
57+
}
58+
benchmark.run()
59+
}
60+
}
61+
}
62+
63+
private def readBenchmark(): Unit = {
64+
withTempPath { dir =>
65+
val path = dir.getCanonicalPath
66+
67+
df.write.orc(path + "/withoutBF")
68+
df.write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF")
69+
70+
runBenchmark(s"ORC Read") {
71+
val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
72+
benchmark.addCase("Without bloom filter") { _ =>
73+
spark.read.orc(path + "/withoutBF").where("value = 0").count
74+
}
75+
benchmark.addCase("With bloom filter") { _ =>
76+
spark.read.orc(path + "/withBF").where("value = 0").count
77+
}
78+
benchmark.run()
79+
}
80+
}
81+
}
82+
83+
override def runBenchmarkSuite(): Unit = {
84+
writeBenchmark()
85+
readBenchmark()
86+
}
87+
}
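One caveat worth noting: the read-side gain only materializes when the `value = 0` predicate is actually pushed down to the ORC reader, since bloom filters are consulted only for pushed-down predicates. A hedged sketch of making that explicit (`spark.sql.orc.filterPushdown` is the relevant SQL config; whether the benchmark harness sets it is not shown in this diff):

```scala
// Assumption: `spark` and `path` as in the benchmark above. Bloom filters
// are only consulted when the predicate reaches the ORC reader.
spark.conf.set("spark.sql.orc.filterPushdown", "true")
spark.read.orc(path + "/withBF").where("value = 0").count
```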
