Commit c1e5688

dongjoon-hyun authored and cloud-fan committed
[SPARK-22672][SQL][TEST] Refactor ORC Tests
## What changes were proposed in this pull request?

Since SPARK-20682, we have two `OrcFileFormat`s. This PR refactors the ORC tests with three principles (with a few exceptions):

1. Move test suites into `sql/core`.
2. Create `HiveXXX` test suites in `sql/hive` by reusing the `sql/core` test suites.
3. `OrcTest` provides the common helper functions and `val orcImp: String`.

**Test Suites**

*Native OrcFileFormat*
- org.apache.spark.sql.execution.datasources.orc
  - OrcFilterSuite
  - OrcPartitionDiscoverySuite
  - OrcQuerySuite
  - OrcSourceSuite
- o.a.s.sql.hive.orc
  - OrcHadoopFsRelationSuite

*Hive built-in OrcFileFormat*
- o.a.s.sql.hive.orc
  - HiveOrcFilterSuite
  - HiveOrcPartitionDiscoverySuite
  - HiveOrcQuerySuite
  - HiveOrcSourceSuite
  - HiveOrcHadoopFsRelationSuite

**Hierarchy**

```
OrcTest
  -> OrcSuite -> OrcSourceSuite
  -> OrcQueryTest -> OrcQuerySuite
  -> OrcPartitionDiscoveryTest -> OrcPartitionDiscoverySuite
  -> OrcFilterSuite

HadoopFsRelationTest -> OrcHadoopFsRelationSuite -> HiveOrcHadoopFsRelationSuite
```

Please note the following:
- Unlike the other test suites, `OrcHadoopFsRelationSuite` doesn't inherit `OrcTest`. It lives in `sql/hive`, like `ParquetHadoopFsRelationSuite`, because of its dependencies, and it follows the existing convention of using `val dataSourceName: String`.
- The `OrcFilterSuite`s cannot reuse test cases because the function signatures differ between the Hive 1.2.1 ORC classes and the Apache ORC 1.4.1 classes.

## How was this patch tested?

Pass the Jenkins tests with the reorganized test suites.

Author: Dongjoon Hyun <[email protected]>

Closes #19882 from dongjoon-hyun/SPARK-22672.
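To make the reuse pattern concrete, here is a minimal Scala sketch of how a shared abstract suite in `sql/core` can be run against both implementations. The test body, the `checkAnswer` call, and the `orcImp` override mechanism (assumed here to default to `"native"` in `OrcTest`) are illustrative placeholders, not the exact code in this commit.

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.datasources.orc.OrcTest
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SharedSQLContext

// sql/core: shared test cases live in an abstract class extending OrcTest,
// which supplies helpers such as withOrcDataFrame and the orcImp switch.
abstract class OrcQueryTest extends OrcTest {
  test("simple filter on ORC data") {
    withOrcDataFrame((1 to 10).map(Tuple1.apply)) { implicit df =>
      checkAnswer(df.where("_1 > 5"), (6 to 10).map(Row(_)))
    }
  }
}

// sql/core: runs the shared cases against the native OrcFileFormat.
class OrcQuerySuite extends OrcQueryTest with SharedSQLContext

// sql/hive: reuses the same cases against the Hive built-in OrcFileFormat by
// switching the implementation name exposed by OrcTest (mechanism assumed).
class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton {
  override val orcImp: String = "hive"
}
```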
1 parent d32337b commit c1e5688

File tree

11 files changed: +971 / −395 lines changed

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala

Lines changed: 0 additions & 28 deletions
```diff
@@ -2775,32 +2775,4 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       }
     }
   }
-
-  test("SPARK-21791 ORC should support column names with dot") {
-    val orc = classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName
-    withTempDir { dir =>
-      val path = new File(dir, "orc").getCanonicalPath
-      Seq(Some(1), None).toDF("col.dots").write.format(orc).save(path)
-      assert(spark.read.format(orc).load(path).collect().length == 2)
-    }
-  }
-
-  test("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") {
-    withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "hive") {
-      val e = intercept[AnalysisException] {
-        sql("CREATE TABLE spark_20728(a INT) USING ORC")
-      }
-      assert(e.message.contains("Hive built-in ORC data source must be used with Hive support"))
-    }
-
-    withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "native") {
-      withTable("spark_20728") {
-        sql("CREATE TABLE spark_20728(a INT) USING ORC")
-        val fileFormat = sql("SELECT * FROM spark_20728").queryExecution.analyzed.collectFirst {
-          case l: LogicalRelation => l.relation.asInstanceOf[HadoopFsRelation].fileFormat.getClass
-        }
-        assert(fileFormat == Some(classOf[OrcFileFormat]))
-      }
-    }
-  }
 }
```

sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala renamed to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala

Lines changed: 47 additions & 27 deletions
```diff
@@ -15,25 +15,32 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.hive.orc
+package org.apache.spark.sql.execution.datasources.orc
 
 import java.nio.charset.StandardCharsets
 import java.sql.{Date, Timestamp}
 
 import scala.collection.JavaConverters._
 
-import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument}
+import org.apache.orc.storage.ql.io.sarg.{PredicateLeaf, SearchArgument}
 
-import org.apache.spark.sql.{Column, DataFrame, QueryTest}
+import org.apache.spark.sql.{Column, DataFrame}
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsRelation, LogicalRelation}
+import org.apache.spark.sql.test.SharedSQLContext
+import org.apache.spark.sql.types._
 
 /**
- * A test suite that tests ORC filter API based filter pushdown optimization.
+ * A test suite that tests Apache ORC filter API based filter pushdown optimization.
+ * OrcFilterSuite and HiveOrcFilterSuite is logically duplicated to provide the same test coverage.
+ * The difference are the packages containing 'Predicate' and 'SearchArgument' classes.
+ * - OrcFilterSuite uses 'org.apache.orc.storage.ql.io.sarg' package.
+ * - HiveOrcFilterSuite uses 'org.apache.hadoop.hive.ql.io.sarg' package.
  */
-class OrcFilterSuite extends QueryTest with OrcTest {
+class OrcFilterSuite extends OrcTest with SharedSQLContext {
+
   private def checkFilterPredicate(
       df: DataFrame,
       predicate: Predicate,
@@ -55,7 +62,7 @@ class OrcFilterSuite extends QueryTest with OrcTest {
       DataSourceStrategy.selectFilters(maybeRelation.get, maybeAnalyzedPredicate.toSeq)
     assert(selectedFilters.nonEmpty, "No filter is pushed down")
 
-    val maybeFilter = OrcFilters.createFilter(query.schema, selectedFilters.toArray)
+    val maybeFilter = OrcFilters.createFilter(query.schema, selectedFilters)
     assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for $selectedFilters")
     checker(maybeFilter.get)
   }
@@ -99,7 +106,7 @@ class OrcFilterSuite extends QueryTest with OrcTest {
       DataSourceStrategy.selectFilters(maybeRelation.get, maybeAnalyzedPredicate.toSeq)
     assert(selectedFilters.nonEmpty, "No filter is pushed down")
 
-    val maybeFilter = OrcFilters.createFilter(query.schema, selectedFilters.toArray)
+    val maybeFilter = OrcFilters.createFilter(query.schema, selectedFilters)
     assert(maybeFilter.isEmpty, s"Could generate filter predicate for $selectedFilters")
   }
 
@@ -284,40 +291,27 @@ class OrcFilterSuite extends QueryTest with OrcTest {
 
   test("filter pushdown - combinations with logical operators") {
     withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df =>
-      // Because `ExpressionTree` is not accessible at Hive 1.2.x, this should be checked
-      // in string form in order to check filter creation including logical operators
-      // such as `and`, `or` or `not`. So, this function uses `SearchArgument.toString()`
-      // to produce string expression and then compare it to given string expression below.
-      // This might have to be changed after Hive version is upgraded.
       checkFilterPredicate(
         '_1.isNotNull,
-        """leaf-0 = (IS_NULL _1)
-          |expr = (not leaf-0)""".stripMargin.trim
+        "leaf-0 = (IS_NULL _1), expr = (not leaf-0)"
       )
       checkFilterPredicate(
         '_1 =!= 1,
-        """leaf-0 = (IS_NULL _1)
-          |leaf-1 = (EQUALS _1 1)
-          |expr = (and (not leaf-0) (not leaf-1))""".stripMargin.trim
+        "leaf-0 = (IS_NULL _1), leaf-1 = (EQUALS _1 1), expr = (and (not leaf-0) (not leaf-1))"
      )
       checkFilterPredicate(
         !('_1 < 4),
-        """leaf-0 = (IS_NULL _1)
-          |leaf-1 = (LESS_THAN _1 4)
-          |expr = (and (not leaf-0) (not leaf-1))""".stripMargin.trim
+        "leaf-0 = (IS_NULL _1), leaf-1 = (LESS_THAN _1 4), expr = (and (not leaf-0) (not leaf-1))"
       )
       checkFilterPredicate(
         '_1 < 2 || '_1 > 3,
-        """leaf-0 = (LESS_THAN _1 2)
-          |leaf-1 = (LESS_THAN_EQUALS _1 3)
-          |expr = (or leaf-0 (not leaf-1))""".stripMargin.trim
+        "leaf-0 = (LESS_THAN _1 2), leaf-1 = (LESS_THAN_EQUALS _1 3), " +
+          "expr = (or leaf-0 (not leaf-1))"
       )
       checkFilterPredicate(
         '_1 < 2 && '_1 > 3,
-        """leaf-0 = (IS_NULL _1)
-          |leaf-1 = (LESS_THAN _1 2)
-          |leaf-2 = (LESS_THAN_EQUALS _1 3)
-          |expr = (and (not leaf-0) leaf-1 (not leaf-2))""".stripMargin.trim
+        "leaf-0 = (IS_NULL _1), leaf-1 = (LESS_THAN _1 2), leaf-2 = (LESS_THAN_EQUALS _1 3), " +
+          "expr = (and (not leaf-0) leaf-1 (not leaf-2))"
       )
     }
   }
@@ -344,4 +338,30 @@ class OrcFilterSuite extends QueryTest with OrcTest {
       checkNoFilterPredicate('_1.isNotNull)
     }
   }
+
+  test("SPARK-12218 Converting conjunctions into ORC SearchArguments") {
+    import org.apache.spark.sql.sources._
+    // The `LessThan` should be converted while the `StringContains` shouldn't
+    val schema = new StructType(
+      Array(
+        StructField("a", IntegerType, nullable = true),
+        StructField("b", StringType, nullable = true)))
+    assertResult("leaf-0 = (LESS_THAN a 10), expr = leaf-0") {
+      OrcFilters.createFilter(schema, Array(
+        LessThan("a", 10),
+        StringContains("b", "prefix")
+      )).get.toString
+    }
+
+    // The `LessThan` should be converted while the whole inner `And` shouldn't
+    assertResult("leaf-0 = (LESS_THAN a 10), expr = leaf-0") {
+      OrcFilters.createFilter(schema, Array(
+        LessThan("a", 10),
+        Not(And(
+          GreaterThan("a", 1),
+          StringContains("b", "prefix")
+        ))
+      )).get.toString
+    }
+  }
 }
```
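As a small sketch (not code from this commit) of why the two filter suites keep separate copies of these tests: the pushed-down filter is checked against `SearchArgument` types from two unrelated packages, and the diff above also shows that the native `OrcFilters.createFilter` takes the pushed filters as a `Seq` where the Hive-side version takes an `Array`. The `assertSingleLeaf` helper below is hypothetical and only stands in for the suite-specific plumbing inside `checkFilterPredicate`.

```scala
import scala.collection.JavaConverters._

// The native suite (sql/core) compiles against Apache ORC 1.4.1's bundled copy
// of the SearchArgument API ...
import org.apache.orc.storage.ql.io.sarg.{PredicateLeaf, SearchArgument}
// ... while HiveOrcFilterSuite (sql/hive) must use the Hive 1.2.1 copy instead:
// import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument}

object SearchArgumentChecks {
  // Hypothetical helper: the same source text works in both suites, but it must
  // be compiled once per package because the two SearchArgument types are unrelated.
  def assertSingleLeaf(sarg: SearchArgument, expected: PredicateLeaf.Operator): Unit = {
    val leaves = sarg.getLeaves.asScala
    assert(leaves.size == 1, s"expected exactly one predicate leaf, got $leaves")
    assert(leaves.head.getOperator == expected)
  }
}
```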

sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala renamed to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcPartitionDiscoverySuite.scala

Lines changed: 11 additions & 37 deletions
```diff
@@ -15,48 +15,21 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.hive.orc
+package org.apache.spark.sql.execution.datasources.orc
 
 import java.io.File
 
-import scala.reflect.ClassTag
-import scala.reflect.runtime.universe.TypeTag
-
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars
-import org.scalatest.BeforeAndAfterAll
-
 import org.apache.spark.sql._
-import org.apache.spark.sql.hive.test.TestHiveSingleton
-import org.apache.spark.util.Utils
+import org.apache.spark.sql.test.SharedSQLContext
 
 // The data where the partitioning key exists only in the directory structure.
 case class OrcParData(intField: Int, stringField: String)
 
 // The data that also includes the partitioning key
 case class OrcParDataWithKey(intField: Int, pi: Int, stringField: String, ps: String)
 
-// TODO This test suite duplicates ParquetPartitionDiscoverySuite a lot
-class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
-  import spark._
-  import spark.implicits._
-
-  val defaultPartitionName = ConfVars.DEFAULTPARTITIONNAME.defaultStrVal
-
-  def withTempDir(f: File => Unit): Unit = {
-    val dir = Utils.createTempDir().getCanonicalFile
-    try f(dir) finally Utils.deleteRecursively(dir)
-  }
-
-  def makeOrcFile[T <: Product: ClassTag: TypeTag](
-      data: Seq[T], path: File): Unit = {
-    data.toDF().write.mode("overwrite").orc(path.getCanonicalPath)
-  }
-
-
-  def makeOrcFile[T <: Product: ClassTag: TypeTag](
-      df: DataFrame, path: File): Unit = {
-    df.write.mode("overwrite").orc(path.getCanonicalPath)
-  }
+abstract class OrcPartitionDiscoveryTest extends OrcTest {
+  val defaultPartitionName = "__HIVE_DEFAULT_PARTITION__"
 
   protected def withTempTable(tableName: String)(f: => Unit): Unit = {
     try f finally spark.catalog.dropTempView(tableName)
@@ -90,7 +63,7 @@ class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
       }
 
-      read.orc(base.getCanonicalPath).createOrReplaceTempView("t")
+      spark.read.orc(base.getCanonicalPath).createOrReplaceTempView("t")
 
       withTempTable("t") {
         checkAnswer(
@@ -137,7 +110,7 @@ class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
       }
 
-      read.orc(base.getCanonicalPath).createOrReplaceTempView("t")
+      spark.read.orc(base.getCanonicalPath).createOrReplaceTempView("t")
 
       withTempTable("t") {
         checkAnswer(
@@ -186,8 +159,8 @@ class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
       }
 
-      read
-        .option(ConfVars.DEFAULTPARTITIONNAME.varname, defaultPartitionName)
+      spark.read
+        .option("hive.exec.default.partition.name", defaultPartitionName)
         .orc(base.getCanonicalPath)
         .createOrReplaceTempView("t")
 
@@ -228,8 +201,8 @@ class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
      }
 
-      read
-        .option(ConfVars.DEFAULTPARTITIONNAME.varname, defaultPartitionName)
+      spark.read
+        .option("hive.exec.default.partition.name", defaultPartitionName)
         .orc(base.getCanonicalPath)
         .createOrReplaceTempView("t")
 
@@ -253,3 +226,4 @@ class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
   }
 }
 
+class OrcPartitionDiscoverySuite extends OrcPartitionDiscoveryTest with SharedSQLContext
```
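Principle 2 above implies a `sql/hive` counterpart for this suite. It is not part of this file's diff, so the following is only a sketch of how `HiveOrcPartitionDiscoverySuite` (listed in the commit message) can reuse the abstract `OrcPartitionDiscoveryTest` defined here; the exact traits it mixes in and the switch mechanism are assumptions.

```scala
// sql/hive (sketch): rerun every test case in OrcPartitionDiscoveryTest with the
// Hive-backed session and the Hive built-in OrcFileFormat selected.
package org.apache.spark.sql.hive.orc

import org.apache.spark.sql.execution.datasources.orc.OrcPartitionDiscoveryTest
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HiveOrcPartitionDiscoverySuite extends OrcPartitionDiscoveryTest with TestHiveSingleton {
  // Assumed switch: OrcTest's `orcImp` selects which OrcFileFormat is exercised.
  override val orcImp: String = "hive"
}
```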

0 commit comments
