@@ -39,7 +39,6 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
39
39
import org .apache .spark .sql .catalyst .encoders .ExpressionEncoder
40
40
import org .apache .spark .sql .catalyst .expressions ._
41
41
import org .apache .spark .sql .catalyst .expressions .Literal .{FalseLiteral , TrueLiteral }
42
- import org .apache .spark .sql .catalyst .expressions .objects .InvokeLike
43
42
import org .apache .spark .sql .catalyst .util .TypeUtils
44
43
import org .apache .spark .sql .execution .InSubqueryExec
45
44
import org .apache .spark .sql .expressions .SparkUserDefinedFunction
@@ -597,6 +596,93 @@ trait DataSkippingReaderBase
597
596
maxExpr : Expression ,
598
597
nullCountExpr : Expression )
599
598
599
+ /**
600
+ * Whitelist of expressions that can be rewritten as partition-like.
601
+ * Set to a finite list to avoid having to silently introducing correctness issues as new
602
+ * expressions that violate the assumptions of partition-like skipping are introduced.
603
+ * There's no need to include [[SkippingEligibleColumn ]] here - it's already handled explicitly.
604
+ *
605
+ * The following expressions have been intentionally excluded from the whitelist of supported
606
+ * expressions:
607
+ * - [[AttributeReference ]]: Any non-skipping eligible column references can't be rewritten as
608
+ * partition-like.
609
+ * - Any nondeterministic expression: The value returned while skipping might be different when
610
+ * the expression is evaluated again. For example, rand() > 0.5 would return ~25% of records
611
+ * if used in data skipping, while the user would expect ~50% of records to be returned.
612
+ * - [[UserDefinedExpression ]]: Often nondeterministic, and may have side effects when executed
613
+ * multiple times.
614
+ * - [[RegExpReplace ]], [[RegExpExtractBase ]], [[Like ]], [[MultiLikeBase ]], [[InvokeLike ]], and
615
+ * [[JsonToStructs ]]: These expressions might be very expensive to evalute more than once.
616
+ */
617
+ private def shouldRewriteAsPartitionLike (expr : Expression ): Boolean = expr match {
618
+ // Expressions supported by traditional data skipping.
619
+ // Boolean operators. AND is explicitly handled by the caller.
620
+ case _ : Not | _ : Or => true
621
+ // Comparison operators.
622
+ case _ : EqualNullSafe | _ : EqualTo | _ : GreaterThan | _ : GreaterThanOrEqual | _ : IsNull |
623
+ _ : IsNotNull | _ : LessThan | _ : LessThanOrEqual => true
624
+ // String and set operators. InSubqueryExec is explicitly handled by the caller.
625
+ case _ : In | _ : InSet | _ : StartsWith => true
626
+ case _ : Literal => true
627
+
628
+ // Expressions only supported for partition-like data skipping.
629
+ // Date and time conversions.
630
+ case _ : ConvertTimezone | _ : DateFormatClass | _ : Extract | _ : GetDateField |
631
+ _ : GetTimeField | _ : IntegralToTimestampBase | _ : MakeDate | _ : MakeTimestamp |
632
+ _ : ParseToDate | _ : ParseToTimestamp | _ : ToTimestamp | _ : TruncDate |
633
+ _ : TruncTimestamp | _ : UTCTimestamp => true
634
+ // Unix date and timestamp conversions.
635
+ case _ : DateFromUnixDate | _ : FromUnixTime | _ : TimestampToLongBase | _ : ToUnixTimestamp |
636
+ _ : UnixDate | _ : UnixTime | _ : UnixTimestamp => true
637
+ // Date and time arithmetic.
638
+ case _ : AddMonthsBase | _ : DateAdd | _ : DateAddInterval | _ : DateDiff | _ : DateSub |
639
+ _ : DatetimeSub | _ : LastDay | _ : MonthsBetween | _ : NextDay | _ : SubtractDates |
640
+ _ : SubtractTimestamps | _ : TimeAdd | _ : TimestampAdd | _ : TimestampAddYMInterval |
641
+ _ : TimestampDiff | _ : TruncInstant => true
642
+ // String expressions.
643
+ case _ : Base64 | _ : BitLength | _ : Chr | _ : ConcatWs | _ : Decode | _ : Elt | _ : Empty2Null |
644
+ _ : Encode | _ : FormatNumber | _ : FormatString | _ : ILike | _ : InitCap | _ : Left |
645
+ _ : Length | _ : Levenshtein | _ : Luhncheck | _ : OctetLength | _ : Overlay | _ : Right |
646
+ _ : Sentences | _ : SoundEx | _ : SplitPart | _ : String2StringExpression |
647
+ _ : String2TrimExpression | _ : StringDecode | _ : StringInstr | _ : StringLPad |
648
+ _ : StringLocate | _ : StringPredicate | _ : StringRPad | _ : StringRepeat |
649
+ _ : StringReplace | _ : StringSpace | _ : StringSplit | _ : StringSplitSQL |
650
+ _ : StringTranslate | _ : StringTrimBoth | _ : Substring | _ : SubstringIndex | _ : ToBinary |
651
+ _ : TryToBinary | _ : UnBase64 => true
652
+ // Arithmetic expressions.
653
+ case _ : Abs | _ : BinaryArithmetic | _ : Greatest | _ : Least | _ : UnaryMinus |
654
+ _ : UnaryPositive => true
655
+ // Array expressions.
656
+ case _ : ArrayBinaryLike | _ : ArrayCompact | _ : ArrayContains | _ : ArrayInsert | _ : ArrayJoin |
657
+ _ : ArrayMax | _ : ArrayMin | _ : ArrayPosition | _ : ArrayRemove | _ : ArrayRepeat |
658
+ _ : ArraySetLike | _ : ArraySize | _ : ArraysZip |
659
+ _ : BinaryArrayExpressionWithImplicitCast | _ : Concat | _ : CreateArray | _ : ElementAt |
660
+ _ : Flatten | _ : Get | _ : GetArrayItem | _ : GetArrayStructFields |
661
+ _ : Reverse | _ : Sequence | _ : Size | _ : Slice | _ : SortArray | _ : TryElementAt => true
662
+ // Map expressions.
663
+ case _ : CreateMap | _ : GetMapValue | _ : MapConcat | _ : MapContainsKey | _ : MapEntries |
664
+ _ : MapFromArrays | _ : MapFromEntries | _ : MapKeys | _ : MapValues | _ : StringToMap => true
665
+ // Struct expressions.
666
+ case _ : CreateNamedStruct | _ : DropField | _ : GetStructField | _ : UpdateFields |
667
+ _ : WithField => true
668
+ // Hash expressions.
669
+ case _ : Crc32 | _ : HashExpression [_] | _ : Md5 | _ : Sha1 | _ : Sha2 => true
670
+ // URL expressions.
671
+ case _ : ParseUrl | _ : UrlDecode | _ : UrlEncode => true
672
+ // NULL expressions.
673
+ case _ : AtLeastNNonNulls | _ : Coalesce | _ : IsNaN | _ : NaNvl | _ : NullIf | _ : Nvl |
674
+ _ : Nvl2 => true
675
+ // Cast expressions.
676
+ case _ : Cast | _ : UpCast => true
677
+ // Conditional expressions.
678
+ case _ : If | _ : CaseWhen => true
679
+ case _ : Alias => true
680
+
681
+ // Don't attempt partition-like skipping on any unknown expressions: there's no way to
682
+ // guarantee it's safe to do so.
683
+ case _ => false
684
+ }
685
+
600
686
/**
601
687
* Rewrites the references in an expression to point to the collected stats over that column
602
688
* (if possible).
@@ -650,10 +736,10 @@ trait DataSkippingReaderBase
650
736
}
651
737
// For other attribute references, we can't safely rewrite the expression.
652
738
case SkippingEligibleColumn (_, _) => None
653
- // Don't attempt data skipping on a nondeterministic expression, since the value returned
654
- // might be different when executed twice on the same input.
655
- // For example, rand() > 0.5 would return ~25% of records if used in data skipping, while the
656
- // user would expect ~50% of records to be returned .
739
+ // Explicitly disallow rewriting nondeterministic expressions. Even though this check isn't
740
+ // strictly necessary (there shouldn't be any nondeterministic expressions in the whitelist),
741
+ // defensively keep it due to the extreme risk of correctness issues if any nondeterministic
742
+ // expressions sneak into the whitelist .
657
743
case other if ! other.deterministic => None
658
744
// Inline subquery results to support InSet. The subquery should generally have already been
659
745
// evaluated.
@@ -667,13 +753,6 @@ trait DataSkippingReaderBase
667
753
Some (InSet (rewrittenChildren, possiblyNullValues.toSet), referencedStats)
668
754
}
669
755
}
670
- // Don't allow rewriting UDFs - even if deterministic, UDFs might have some unexpected
671
- // side effects when executed twice.
672
- case _ : UserDefinedExpression => None
673
- // Don't attempt to rewrite expressions might be extremely expensive to invoke twice.
674
- case _ : RegExpReplace | _ : RegExpExtractBase | _ : Like | _ : MultiLikeBase => None
675
- case _ : InvokeLike => None
676
- case _ : JsonToStructs => None
677
756
// Pushdown NOT through OR - we prefer AND to OR because AND can tolerate one branch not being
678
757
// rewriteable.
679
758
case Not (Or (e1, e2)) =>
@@ -688,14 +767,16 @@ trait DataSkippingReaderBase
688
767
Some ((And (newLeft, newRight), statsLeft ++ statsRight))
689
768
case _ => leftResult.orElse(rightResult)
690
769
}
691
- // For all other expressions, recursively rewrite the children.
692
- case other =>
770
+ // For all other eligible expressions, recursively rewrite the children.
771
+ case other if shouldRewriteAsPartitionLike(other) =>
693
772
val childResults = other.children.map(
694
773
rewriteDataFiltersAsPartitionLikeInternal(_, clusteringColumnPaths))
695
774
Option .whenNot (childResults.exists(_.isEmpty)) {
696
775
val (children, stats) = childResults.map(_.get).unzip
697
776
(other.withNewChildren(children), stats.flatten.toSet)
698
777
}
778
+ // Don't attempt rewriting any non-whitelisted expressions.
779
+ case _ => None
699
780
}
700
781
701
782
/**
0 commit comments