Skip to content
This repository was archived by the owner on Jan 9, 2020. It is now read-only.

Commit b9b54b1

Browse files
sarutakgatorsmile
authored and committed
[SPARK-21368][SQL] TPCDSQueryBenchmark can't refer query files.
## What changes were proposed in this pull request? TPCDSQueryBenchmark packaged into a jar doesn't work with spark-submit. This is because the query files cannot be referenced as regular files once they are inside the jar file. ## How was this patch tested? Ran the benchmark. Author: sarutak <[email protected]> Author: Kousuke Saruta <[email protected]> Closes apache#18592 from sarutak/fix-tpcds-benchmark.
1 parent 720c94f commit b9b54b1

File tree

2 files changed

+74
-14
lines changed

2 files changed

+74
-14
lines changed

sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717

1818
package org.apache.spark.sql.execution.benchmark
1919

20-
import java.io.File
21-
2220
import org.apache.spark.SparkConf
2321
import org.apache.spark.sql.SparkSession
2422
import org.apache.spark.sql.catalyst.TableIdentifier
@@ -31,7 +29,7 @@ import org.apache.spark.util.Benchmark
3129
/**
3230
* Benchmark to measure TPCDS query performance.
3331
* To run this:
34-
* spark-submit --class <this class> --jars <spark sql test jar>
32+
* spark-submit --class <this class> <spark sql test jar> --data-location <TPCDS data location>
3533
*/
3634
object TPCDSQueryBenchmark {
3735
val conf =
@@ -61,12 +59,10 @@ object TPCDSQueryBenchmark {
6159
}
6260

6361
def tpcdsAll(dataLocation: String, queries: Seq[String]): Unit = {
64-
require(dataLocation.nonEmpty,
65-
"please modify the value of dataLocation to point to your local TPCDS data")
6662
val tableSizes = setupTables(dataLocation)
6763
queries.foreach { name =>
68-
val queryString = fileToString(new File(Thread.currentThread().getContextClassLoader
69-
.getResource(s"tpcds/$name.sql").getFile))
64+
val queryString = resourceToString(s"tpcds/$name.sql",
65+
classLoader = Thread.currentThread().getContextClassLoader)
7066

7167
// This is an indirect hack to estimate the size of each query's input by traversing the
7268
// logical plan and adding up the sizes of all tables that appear in the plan. Note that this
@@ -99,6 +95,7 @@ object TPCDSQueryBenchmark {
9995
}
10096

10197
def main(args: Array[String]): Unit = {
98+
val benchmarkArgs = new TPCDSQueryBenchmarkArguments(args)
10299

103100
// List of all TPC-DS queries
104101
val tpcdsQueries = Seq(
@@ -113,12 +110,6 @@ object TPCDSQueryBenchmark {
113110
"q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", "q90",
114111
"q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99")
115112

116-
// In order to run this benchmark, please follow the instructions at
117-
// https://github.com/databricks/spark-sql-perf/blob/master/README.md to generate the TPCDS data
118-
// locally (preferably with a scale factor of 5 for benchmarking). Thereafter, the value of
119-
// dataLocation below needs to be set to the location where the generated data is stored.
120-
val dataLocation = ""
121-
122-
tpcdsAll(dataLocation, queries = tpcdsQueries)
113+
tpcdsAll(benchmarkArgs.dataLocation, queries = tpcdsQueries)
123114
}
124115
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql.execution.benchmark
19+
20+
class TPCDSQueryBenchmarkArguments(val args: Array[String]) {
21+
var dataLocation: String = null
22+
23+
parseArgs(args.toList)
24+
validateArguments()
25+
26+
private def parseArgs(inputArgs: List[String]): Unit = {
27+
var args = inputArgs
28+
29+
while(args.nonEmpty) {
30+
args match {
31+
case ("--data-location") :: value :: tail =>
32+
dataLocation = value
33+
args = tail
34+
35+
case _ =>
36+
// scalastyle:off println
37+
System.err.println("Unknown/unsupported param " + args)
38+
// scalastyle:on println
39+
printUsageAndExit(1)
40+
}
41+
}
42+
}
43+
44+
private def printUsageAndExit(exitCode: Int): Unit = {
45+
// scalastyle:off
46+
System.err.println("""
47+
|Usage: spark-submit --class <this class> <spark sql test jar> [Options]
48+
|Options:
49+
| --data-location Path to TPCDS data
50+
|
51+
|------------------------------------------------------------------------------------------------------------------
52+
|In order to run this benchmark, please follow the instructions at
53+
|https://github.com/databricks/spark-sql-perf/blob/master/README.md
54+
|to generate the TPCDS data locally (preferably with a scale factor of 5 for benchmarking).
55+
|Thereafter, the value of <TPCDS data location> needs to be set to the location where the generated data is stored.
56+
""".stripMargin)
57+
// scalastyle:on
58+
System.exit(exitCode)
59+
}
60+
61+
private def validateArguments(): Unit = {
62+
if (dataLocation == null) {
63+
// scalastyle:off println
64+
System.err.println("Must specify a data location")
65+
// scalastyle:on println
66+
printUsageAndExit(-1)
67+
}
68+
}
69+
}

0 commit comments

Comments
 (0)