Commit 4177292

gengliangwang authored and cloud-fan committed
[SPARK-27435][SQL] Support schema pruning in ORC V2
## What changes were proposed in this pull request?

Currently, the optimization rule `SchemaPruning` only works for Parquet/ORC V1. We should have the same optimization in ORC V2.

## How was this patch tested?

Unit test

Closes apache#24338 from gengliangwang/schemaPruningForV2.

Authored-by: Gengliang Wang <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 0745333 commit 4177292
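
For orientation (not part of the commit): a rough sketch of the kind of query this rule now optimizes on the V2 path, assuming a hypothetical local session and a throwaway `/tmp/contacts` path. With nested schema pruning enabled and `orc` removed from the V1 source list, selecting a single nested field should leave the ORC scan reading only that leaf.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

// Hypothetical session and path, for illustration only.
val spark = SparkSession.builder()
  .master("local[*]")
  .config(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key, "true")
  .config(SQLConf.USE_V1_SOURCE_READER_LIST.key, "")  // empty list: route "orc" through the V2 reader
  .getOrCreate()

// Write a small dataset with a nested struct column.
spark.range(1)
  .selectExpr("id", "named_struct('first', 'Jane', 'last', 'Doe') AS name")
  .write.mode("overwrite").format("orc").save("/tmp/contacts")

// Only name.first is requested, so after this rule the V2 ORC scan should read
// struct<name:struct<first:string>> rather than the full file schema.
spark.read.format("orc").load("/tmp/contacts").select("name.first").explain(true)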

4 files changed: +139 −23 lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala

Lines changed: 84 additions & 20 deletions
@@ -17,12 +17,15 @@
 
 package org.apache.spark.sql.execution.datasources
 
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
-import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
+import org.apache.spark.sql.catalyst.plans.logical.{Filter, LeafNode, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
+import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, FileTable}
+import org.apache.spark.sql.execution.datasources.v2.orc.OrcTable
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructField, StructType}
 
@@ -48,7 +51,7 @@ object SchemaPruning extends Rule[LogicalPlan] {
           l @ LogicalRelation(hadoopFsRelation: HadoopFsRelation, _, _, _))
         if canPruneRelation(hadoopFsRelation) =>
         val (normalizedProjects, normalizedFilters) =
-          normalizeAttributeRefNames(l, projects, filters)
+          normalizeAttributeRefNames(l.output, projects, filters)
         val requestedRootFields = identifyRootFields(normalizedProjects, normalizedFilters)
 
         // If requestedRootFields includes a nested field, continue. Otherwise,
@@ -76,6 +79,43 @@ object SchemaPruning extends Rule[LogicalPlan] {
         } else {
           op
         }
+
+      case op @ PhysicalOperation(projects, filters,
+          d @ DataSourceV2Relation(table: FileTable, output, _)) if canPruneTable(table) =>
+        val (normalizedProjects, normalizedFilters) =
+          normalizeAttributeRefNames(output, projects, filters)
+        val requestedRootFields = identifyRootFields(normalizedProjects, normalizedFilters)
+
+        // If requestedRootFields includes a nested field, continue. Otherwise,
+        // return op
+        if (requestedRootFields.exists { root: RootField => !root.derivedFromAtt }) {
+          val dataSchema = table.dataSchema
+          val prunedDataSchema = pruneDataSchema(dataSchema, requestedRootFields)
+
+          // If the data schema is different from the pruned data schema, continue. Otherwise,
+          // return op. We effect this comparison by counting the number of "leaf" fields in
+          // each schemata, assuming the fields in prunedDataSchema are a subset of the fields
+          // in dataSchema.
+          if (countLeaves(dataSchema) > countLeaves(prunedDataSchema)) {
+            val prunedFileTable = table match {
+              case o: OrcTable => o.copy(userSpecifiedSchema = Some(prunedDataSchema))
+              case _ =>
+                val message = s"${table.formatName} data source doesn't support schema pruning."
+                throw new AnalysisException(message)
+            }
+
+
+            val prunedRelationV2 = buildPrunedRelationV2(d, prunedFileTable)
+            val projectionOverSchema = ProjectionOverSchema(prunedDataSchema)
+
+            buildNewProjection(normalizedProjects, normalizedFilters, prunedRelationV2,
+              projectionOverSchema)
+          } else {
+            op
+          }
+        } else {
+          op
+        }
     }
 
   /**
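
The inner `if` in the hunk above compares the original and pruned schemas by counting leaf fields. `countLeaves` already exists elsewhere in SchemaPruning.scala and is untouched by this diff; as a rough sketch of the idea only (not the actual helper), a leaf count over Spark SQL types could look like:

import org.apache.spark.sql.types._

// Illustrative sketch: count primitive leaves, descending through structs, arrays and maps.
def leafCount(dataType: DataType): Int = dataType match {
  case StructType(fields) => fields.map(f => leafCount(f.dataType)).sum
  case ArrayType(elementType, _) => leafCount(elementType)
  case MapType(keyType, valueType, _) => leafCount(keyType) + leafCount(valueType)
  case _ => 1
}

// A pruned struct<name:struct<first:string>> counts 1 leaf, while the full
// struct<id:bigint,name:struct<first:string,last:string>> counts 3, so pruning proceeds.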
@@ -85,16 +125,22 @@ object SchemaPruning extends Rule[LogicalPlan] {
     fsRelation.fileFormat.isInstanceOf[ParquetFileFormat] ||
       fsRelation.fileFormat.isInstanceOf[OrcFileFormat]
 
+  /**
+   * Checks to see if the given [[FileTable]] can be pruned. Currently we support ORC v2.
+   */
+  private def canPruneTable(table: FileTable) =
+    table.isInstanceOf[OrcTable]
+
   /**
    * Normalizes the names of the attribute references in the given projects and filters to reflect
    * the names in the given logical relation. This makes it possible to compare attributes and
    * fields by name. Returns a tuple with the normalized projects and filters, respectively.
    */
   private def normalizeAttributeRefNames(
-      logicalRelation: LogicalRelation,
+      output: Seq[AttributeReference],
       projects: Seq[NamedExpression],
       filters: Seq[Expression]): (Seq[NamedExpression], Seq[Expression]) = {
-    val normalizedAttNameMap = logicalRelation.output.map(att => (att.exprId, att.name)).toMap
+    val normalizedAttNameMap = output.map(att => (att.exprId, att.name)).toMap
     val normalizedProjects = projects.map(_.transform {
       case att: AttributeReference if normalizedAttNameMap.contains(att.exprId) =>
         att.withName(normalizedAttNameMap(att.exprId))
@@ -107,11 +153,13 @@ object SchemaPruning extends Rule[LogicalPlan] {
   }
 
   /**
-   * Builds the new output [[Project]] Spark SQL operator that has the pruned output relation.
+   * Builds the new output [[Project]] Spark SQL operator that has the `leafNode`.
    */
   private def buildNewProjection(
-      projects: Seq[NamedExpression], filters: Seq[Expression], prunedRelation: LogicalRelation,
-      projectionOverSchema: ProjectionOverSchema) = {
+      projects: Seq[NamedExpression],
+      filters: Seq[Expression],
+      leafNode: LeafNode,
+      projectionOverSchema: ProjectionOverSchema): Project = {
     // Construct a new target for our projection by rewriting and
     // including the original filters where available
     val projectionChild =
@@ -120,9 +168,9 @@ object SchemaPruning extends Rule[LogicalPlan] {
         case projectionOverSchema(expr) => expr
       })
       val newFilterCondition = projectedFilters.reduce(And)
-      Filter(newFilterCondition, prunedRelation)
+      Filter(newFilterCondition, leafNode)
     } else {
-      prunedRelation
+      leafNode
     }
 
     // Construct the new projections of our Project by
@@ -145,20 +193,36 @@ object SchemaPruning extends Rule[LogicalPlan] {
   private def buildPrunedRelation(
       outputRelation: LogicalRelation,
       prunedBaseRelation: HadoopFsRelation) = {
+    val prunedOutput = getPrunedOutput(outputRelation.output, prunedBaseRelation.schema)
+    outputRelation.copy(relation = prunedBaseRelation, output = prunedOutput)
+  }
+
+  /**
+   * Builds a pruned data source V2 relation from the output of the relation and the schema
+   * of the pruned [[FileTable]].
+   */
+  private def buildPrunedRelationV2(
+      outputRelation: DataSourceV2Relation,
+      prunedFileTable: FileTable) = {
+    val prunedOutput = getPrunedOutput(outputRelation.output, prunedFileTable.schema)
+    outputRelation.copy(table = prunedFileTable, output = prunedOutput)
+  }
+
+  // Prune the given output to make it consistent with `requiredSchema`.
+  private def getPrunedOutput(
+      output: Seq[AttributeReference],
+      requiredSchema: StructType): Seq[AttributeReference] = {
     // We need to replace the expression ids of the pruned relation output attributes
     // with the expression ids of the original relation output attributes so that
     // references to the original relation's output are not broken
-    val outputIdMap = outputRelation.output.map(att => (att.name, att.exprId)).toMap
-    val prunedRelationOutput =
-      prunedBaseRelation
-        .schema
-        .toAttributes
-        .map {
-          case att if outputIdMap.contains(att.name) =>
-            att.withExprId(outputIdMap(att.name))
-          case att => att
-        }
-    outputRelation.copy(relation = prunedBaseRelation, output = prunedRelationOutput)
+    val outputIdMap = output.map(att => (att.name, att.exprId)).toMap
+    requiredSchema
+      .toAttributes
+      .map {
+        case att if outputIdMap.contains(att.name) =>
+          att.withExprId(outputIdMap(att.name))
+        case att => att
+      }
   }
 
   /**
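
As on the V1 path, the rebuilt output must keep the original expression ids so that plans referencing the old relation still resolve. A small hypothetical illustration of that remapping (not code from this patch), using plain catalyst types:

import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.types._

// Hypothetical original output and a pruned schema that keeps only "name".
val original = Seq(
  AttributeReference("id", LongType)(),
  AttributeReference("name", StringType)())
val requiredSchema = StructType(Seq(StructField("name", StringType)))

val outputIdMap = original.map(att => (att.name, att.exprId)).toMap
val prunedOutput = requiredSchema.map { field =>
  val fresh = AttributeReference(field.name, field.dataType, field.nullable)()
  if (outputIdMap.contains(fresh.name)) fresh.withExprId(outputIdMap(fresh.name)) else fresh
}
// prunedOutput contains a single "name" attribute that carries the exprId of the
// original "name", mirroring what getPrunedOutput does with requiredSchema.toAttributes.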

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -407,7 +407,7 @@ abstract class SchemaPruningSuite
     }
   }
 
-  private val schemaEquality = new Equality[StructType] {
+  protected val schemaEquality = new Equality[StructType] {
     override def areEqual(a: StructType, b: Any): Boolean =
       b match {
         case otherType: StructType => a.sameType(otherType)
@@ -422,7 +422,7 @@ abstract class SchemaPruningSuite
     df.collect()
   }
 
-  private def checkScanSchemata(df: DataFrame, expectedSchemaCatalogStrings: String*): Unit = {
+  protected def checkScanSchemata(df: DataFrame, expectedSchemaCatalogStrings: String*): Unit = {
     val fileSourceScanSchemata =
       df.queryExecution.executedPlan.collect {
         case scan: FileSourceScanExec => scan.requiredSchema

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSchemaPruningSuite.scala renamed to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV1SchemaPruningSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ import org.apache.spark.SparkConf
 import org.apache.spark.sql.execution.datasources.SchemaPruningSuite
 import org.apache.spark.sql.internal.SQLConf
 
-class OrcSchemaPruningSuite extends SchemaPruningSuite {
+class OrcV1SchemaPruningSuite extends SchemaPruningSuite {
   override protected val dataSourceName: String = "orc"
   override protected val vectorizedReaderEnabledKey: String =
     SQLConf.ORC_VECTORIZED_READER_ENABLED.key
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV2SchemaPruningSuite.scala

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.datasources.orc
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
+import org.apache.spark.sql.execution.datasources.SchemaPruningSuite
+import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
+import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan
+import org.apache.spark.sql.internal.SQLConf
+
+class OrcV2SchemaPruningSuite extends SchemaPruningSuite {
+  override protected val dataSourceName: String = "orc"
+  override protected val vectorizedReaderEnabledKey: String =
+    SQLConf.ORC_VECTORIZED_READER_ENABLED.key
+
+  override protected def sparkConf: SparkConf =
+    super
+      .sparkConf
+      .set(SQLConf.USE_V1_SOURCE_READER_LIST, "")
+
+  override def checkScanSchemata(df: DataFrame, expectedSchemaCatalogStrings: String*): Unit = {
+    val fileSourceScanSchemata =
+      df.queryExecution.executedPlan.collect {
+        case BatchScanExec(_, scan: OrcScan) => scan.readDataSchema
+      }
+    assert(fileSourceScanSchemata.size === expectedSchemaCatalogStrings.size,
+      s"Found ${fileSourceScanSchemata.size} file sources in dataframe, " +
+        s"but expected $expectedSchemaCatalogStrings")
+    fileSourceScanSchemata.zip(expectedSchemaCatalogStrings).foreach {
+      case (scanSchema, expectedScanSchemaCatalogString) =>
+        val expectedScanSchema = CatalystSqlParser.parseDataType(expectedScanSchemaCatalogString)
+        implicit val equality = schemaEquality
+        assert(scanSchema === expectedScanSchema)
+    }
+  }
+}
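
The override matches `BatchScanExec(_, scan: OrcScan)` instead of `FileSourceScanExec`, since the V2 path plans file scans as batch scans. A hypothetical call from a test body in the shared suite (not part of this patch) looks the same for both suites:

// Assuming nested contact data has already been written as ORC and registered as "contacts"
// (hypothetical names; the shared SchemaPruningSuite drives this helper for V1 and V2 alike):
val query = spark.table("contacts").select("name.first")
checkScanSchemata(query, "struct<name:struct<first:string>>")
// V1 asserts on FileSourceScanExec.requiredSchema; with this override, V2 asserts on
// OrcScan.readDataSchema behind BatchScanExec, so both check the same pruned schema.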
