 
 package org.apache.spark.sql.execution.datasources.v2
 
-import org.apache.spark.sql.{execution, Strategy}
-import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet}
+import scala.collection.mutable
+
+import org.apache.spark.sql.{sources, Strategy}
+import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression}
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan}
+import org.apache.spark.sql.execution.datasources.DataSourceStrategy
 import org.apache.spark.sql.execution.streaming.continuous.{WriteToContinuousDataSource, WriteToContinuousDataSourceExec}
+import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, SupportsPushDownCatalystFilters, SupportsPushDownFilters, SupportsPushDownRequiredColumns}
 
 object DataSourceV2Strategy extends Strategy {
-  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
-    case PhysicalOperation(project, filters, relation: DataSourceV2Relation) =>
-      val projectSet = AttributeSet(project.flatMap(_.references))
-      val filterSet = AttributeSet(filters.flatMap(_.references))
-
-      val projection = if (filterSet.subsetOf(projectSet) &&
-          AttributeSet(relation.output) == projectSet) {
-        // When the required projection contains all of the filter columns and column pruning alone
-        // can produce the required projection, push the required projection.
-        // A final projection may still be needed if the data source produces a different column
-        // order or if it cannot prune all of the nested columns.
-        relation.output
-      } else {
-        // When there are filter columns not already in the required projection or when the required
-        // projection is more complicated than column pruning, base column pruning on the set of
-        // all columns needed by both.
-        (projectSet ++ filterSet).toSeq
-      }
 
-      val reader = relation.newReader
+  /**
+   * Pushes down filters to the data source reader.
+   *
+   * @return pushed filters and post-scan filters.
+   */
+  private def pushFilters(
+      reader: DataSourceReader,
+      filters: Seq[Expression]): (Seq[Expression], Seq[Expression]) = {
+    reader match {
+      case r: SupportsPushDownCatalystFilters =>
+        val postScanFilters = r.pushCatalystFilters(filters.toArray)
+        val pushedFilters = r.pushedCatalystFilters()
+        (pushedFilters, postScanFilters)
+
+      case r: SupportsPushDownFilters =>
+        // A map from translated data source filters to original catalyst filter expressions.
+        val translatedFilterToExpr = mutable.HashMap.empty[sources.Filter, Expression]
+        // Catalyst filter expressions that can't be translated to data source filters.
+        val untranslatableExprs = mutable.ArrayBuffer.empty[Expression]
+
+        for (filterExpr <- filters) {
+          val translated = DataSourceStrategy.translateFilter(filterExpr)
+          if (translated.isDefined) {
+            translatedFilterToExpr(translated.get) = filterExpr
+          } else {
+            untranslatableExprs += filterExpr
+          }
+        }
+
+        // Data source filters that need to be evaluated again after scanning, which means
+        // the data source cannot guarantee the rows returned can pass these filters.
+        // As a result we must return them so Spark can plan an extra filter operator.
+        val postScanFilters = r.pushFilters(translatedFilterToExpr.keys.toArray)
+          .map(translatedFilterToExpr)
+        // The filters which are marked as pushed to this data source.
+        val pushedFilters = r.pushedFilters().map(translatedFilterToExpr)
+        (pushedFilters, untranslatableExprs ++ postScanFilters)
+
+      case _ => (Nil, filters)
+    }
+  }
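For context, here is a minimal reader-side sketch of the contract this helper relies on. It is not part of the patch: ExamplePushdown is a hypothetical mix-in, assuming the Spark 2.3-era DataSource V2 API imported above.

import org.apache.spark.sql.sources.{Filter, IsNotNull}
import org.apache.spark.sql.sources.v2.reader.SupportsPushDownFilters

trait ExamplePushdown extends SupportsPushDownFilters {
  private var pushed: Array[Filter] = Array.empty

  // Keep the filters this source can evaluate (here only IsNotNull) and hand
  // the rest back; Spark plans a FilterExec for everything returned.
  override def pushFilters(filters: Array[Filter]): Array[Filter] = {
    val (supported, unsupported) = filters.partition(_.isInstanceOf[IsNotNull])
    pushed = supported
    unsupported
  }

  // Advertise what was pushed; the strategy maps these back to catalyst
  // expressions through translatedFilterToExpr and records them on the scan.
  override def pushedFilters(): Array[Filter] = pushed
}

A best-effort source may both push a filter and return it from pushFilters (e.g. a Parquet row-group filter skips some data but cannot guarantee every surviving row passes), which is why the pushed and post-scan sets are allowed to overlap.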
 
-      val output = DataSourceV2Relation.pushRequiredColumns(relation, reader,
-        projection.asInstanceOf[Seq[AttributeReference]].toStructType)
+  /**
+   * Applies column pruning to the data source, w.r.t. the references of the given expressions.
+   *
+   * @return new output attributes after column pruning.
+   */
+  // TODO: nested column pruning.
+  private def pruneColumns(
+      reader: DataSourceReader,
+      relation: DataSourceV2Relation,
+      exprs: Seq[Expression]): Seq[AttributeReference] = {
+    reader match {
+      case r: SupportsPushDownRequiredColumns =>
+        val requiredColumns = AttributeSet(exprs.flatMap(_.references))
+        val neededOutput = relation.output.filter(requiredColumns.contains)
+        if (neededOutput != relation.output) {
+          r.pruneColumns(neededOutput.toStructType)
+          val nameToAttr = relation.output.map(_.name).zip(relation.output).toMap
+          r.readSchema().toAttributes.map {
+            // We have to keep the attribute id during transformation.
+            a => a.withExprId(nameToAttr(a.name).exprId)
+          }
+        } else {
+          relation.output
+        }
+
+      case _ => relation.output
+    }
+  }
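A matching sketch for the column-pruning side (again hypothetical, same API assumption): the strategy pushes a narrower schema into the reader, then reads it back through readSchema() and re-attaches the relation's expression IDs by column name.

import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.sources.v2.reader.SupportsPushDownRequiredColumns

trait ExamplePruning extends SupportsPushDownRequiredColumns {
  // Full schema of the source, supplied by the concrete reader.
  protected def fullSchema: StructType

  private var required: Option[StructType] = None

  // Record the columns Spark needs: the references of project ++ postScanFilters.
  override def pruneColumns(requiredSchema: StructType): Unit = {
    required = Some(requiredSchema)
  }

  // Report the (possibly pruned) schema back to the strategy.
  override def readSchema(): StructType = required.getOrElse(fullSchema)
}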
 
-      val (postScanFilters, pushedFilters) = DataSourceV2Relation.pushFilters(reader, filters)
 
-      logInfo(s"Post-Scan Filters: ${postScanFilters.mkString(",")}")
-      logInfo(s"Pushed Filters: ${pushedFilters.mkString(", ")}")
+  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+    case PhysicalOperation(project, filters, relation: DataSourceV2Relation) =>
+      val reader = relation.newReader()
+      // `pushedFilters` will be pushed down and evaluated in the underlying data sources.
+      // `postScanFilters` need to be evaluated after the scan.
+      // `postScanFilters` and `pushedFilters` can overlap, e.g. the Parquet row group filter.
+      val (pushedFilters, postScanFilters) = pushFilters(reader, filters)
+      val output = pruneColumns(reader, relation, project ++ postScanFilters)
+      logInfo(
+        s"""
+           |Pushing operators to ${relation.source.getClass}
+           |Pushed Filters: ${pushedFilters.mkString(", ")}
+           |Post-Scan Filters: ${postScanFilters.mkString(", ")}
+           |Output: ${output.mkString(", ")}
+         """.stripMargin)
 
       val scan = DataSourceV2ScanExec(
         output, relation.source, relation.options, pushedFilters, reader)
 
-      val filter = postScanFilters.reduceLeftOption(And)
-      val withFilter = filter.map(execution.FilterExec(_, scan)).getOrElse(scan)
+      val filterCondition = postScanFilters.reduceLeftOption(And)
+      val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan)
 
       val withProjection = if (withFilter.output != project) {
-        execution.ProjectExec(project, withFilter)
+        ProjectExec(project, withFilter)
       } else {
         withFilter
       }
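To see how the pieces compose, here is a hypothetical trace of the new apply() for a reader mixing in the two sketches above; myUdf and the column names are made up.

// df.filter($"b".isNotNull && myUdf($"a")).select($"a")
//
// pushFilters(reader, [isnotnull(b), myUdf(a)]):
//   translateFilter(isnotnull(b)) = Some(IsNotNull("b"))  -> pushed to the source
//   translateFilter(myUdf(a))     = None                  -> untranslatable, post-scan
//   => pushedFilters = [isnotnull(b)], postScanFilters = [myUdf(a)]
//
// pruneColumns(reader, relation, project ++ postScanFilters):
//   the references of [a, myUdf(a)] are {a}, so the reader schema is pruned to struct<a>
//
// Resulting physical plan (no ProjectExec, since the scan already outputs exactly [a]):
//   FilterExec(myUdf(a), DataSourceV2ScanExec([a], ...))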