
Commit 6cc7021

dongjoon-hyun authored and cloud-fan committed
[SPARK-22267][SQL][TEST] Spark SQL incorrectly reads ORC files when column order is different
## What changes were proposed in this pull request?

Until 2.2.1, with the default configuration, Apache Spark returns incorrect results when the ORC file schema differs from the metastore schema order. This is due to the Hive 1.2.1 library and some issues with the `convertMetastoreOrc` option.

```scala
scala> Seq(1 -> 2).toDF("c1", "c2").write.format("orc").mode("overwrite").save("/tmp/o")

scala> sql("CREATE EXTERNAL TABLE o(c2 INT, c1 INT) STORED AS orc LOCATION '/tmp/o'")

scala> spark.table("o").show // This is wrong.
+---+---+
| c2| c1|
+---+---+
|  1|  2|
+---+---+

scala> spark.read.orc("/tmp/o").show // This is correct.
+---+---+
| c1| c2|
+---+---+
|  1|  2|
+---+---+
```

After [SPARK-22279](#19499), the default configuration no longer has this bug. Although the Hive 1.2.1 library code path still has the problem, we had better add test coverage for the current behavior in order to prevent a future regression.

## How was this patch tested?

Passes Jenkins with a newly added test.

Author: Dongjoon Hyun <[email protected]>

Closes #19928 from dongjoon-hyun/SPARK-22267.
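The incorrect result above comes down to how the reader maps file columns onto the table schema: by position versus by name. The toy sketch below (plain Scala, hypothetical helper names, no Spark involved) illustrates why positional resolution returns swapped values when the metastore declares the columns in a different order than the ORC file stores them:

```scala
// Toy model of two column-resolution strategies (hypothetical names,
// not Spark's actual reader code).
case class FileColumn(name: String, value: Int)

// By-position: trust the table schema's order and ignore file column names.
// This mirrors the buggy behavior described above.
def resolveByPosition(fileRow: Seq[FileColumn], tableSchema: Seq[String]): Map[String, Int] =
  tableSchema.zip(fileRow.map(_.value)).toMap

// By-name: look each table column up in the file schema by its name.
// This mirrors the correct behavior of `spark.read.orc`.
def resolveByName(fileRow: Seq[FileColumn], tableSchema: Seq[String]): Map[String, Int] =
  tableSchema.map(n => n -> fileRow.find(_.name == n).get.value).toMap

// The ORC file was written as (c1 = 1, c2 = 2); the table declares (c2, c1).
val fileRow = Seq(FileColumn("c1", 1), FileColumn("c2", 2))
val tableSchema = Seq("c2", "c1")

println(resolveByPosition(fileRow, tableSchema)) // swapped: c2 -> 1, c1 -> 2
println(resolveByName(fileRow, tableSchema))     // correct: c2 -> 2, c1 -> 1
```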
1 parent ec873a4 commit 6cc7021

File tree

1 file changed: +19 −0 lines changed


sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

Lines changed: 19 additions & 0 deletions
```diff
@@ -2153,4 +2153,23 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       }
     }
   }
+
+  test("SPARK-22267 Spark SQL incorrectly reads ORC files when column order is different") {
+    Seq("native", "hive").foreach { orcImpl =>
+      withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> orcImpl) {
+        withTempPath { f =>
+          val path = f.getCanonicalPath
+          Seq(1 -> 2).toDF("c1", "c2").write.orc(path)
+          checkAnswer(spark.read.orc(path), Row(1, 2))
+
+          withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { // default since 2.3.0
+            withTable("t") {
+              sql(s"CREATE EXTERNAL TABLE t(c2 INT, c1 INT) STORED AS ORC LOCATION '$path'")
+              checkAnswer(spark.table("t"), Row(2, 1))
+            }
+          }
+        }
+      }
+    }
+  }
 }
```
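For reference, the two `SQLConf` entries the test toggles correspond to the following user-facing configuration keys (key names as I understand them for Spark 2.3; shown as a config fragment, not code from this commit):

```
spark.sql.orc.impl                  native   # SQLConf.ORC_IMPLEMENTATION; "hive" selects the Hive 1.2.1 reader
spark.sql.hive.convertMetastoreOrc  true     # HiveUtils.CONVERT_METASTORE_ORC; default since 2.3.0
```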
