|
11 | 11 | import org.elasticsearch.common.io.stream.NamedWriteableRegistry; |
12 | 12 | import org.elasticsearch.common.io.stream.StreamInput; |
13 | 13 | import org.elasticsearch.common.io.stream.StreamOutput; |
| 14 | +import org.elasticsearch.compute.data.Block; |
| 15 | +import org.elasticsearch.compute.data.Vector; |
14 | 16 | import org.elasticsearch.compute.operator.EvalOperator; |
15 | 17 | import org.elasticsearch.xpack.esql.EsqlIllegalArgumentException; |
16 | 18 | import org.elasticsearch.xpack.esql.core.expression.Expression; |
|
50 | 52 | import static org.elasticsearch.xpack.esql.core.type.DataType.VERSION; |
51 | 53 | import static org.elasticsearch.xpack.esql.core.util.StringUtils.ordinal; |
52 | 54 |
|
| 55 | +/** |
| 56 | + * The {@code IN} operator. |
| 57 | + * <p> |
| 58 | + * This function has quite "unique" null handling rules around {@code null} and multivalued |
| 59 | + * fields. The {@code null} rules are inspired by PostgreSQL, and, presumably, every other |
| 60 | + * SQL implementation. The multivalue rules are pretty much an extension of the "multivalued |
| 61 | + * fields are like null in scalars" rule. Here are some examples:
| 62 | + * </p> |
| 63 | + * <ul> |
| 64 | + * <li>{@code 'x' IN ('a', 'b', 'c')} => {@code false}</li>
| 65 | + * <li>{@code 'x' IN ('a', 'x', 'c')} => {@code true}</li>
| 66 | + * <li>{@code null IN ('a', 'b', 'c')} => {@code null}</li>
| 67 | + * <li>{@code ['x', 'y'] IN ('a', 'b', 'c')} => {@code null} and a warning</li>
| 68 | + * <li>{@code 'x' IN ('a', null, 'c')} => {@code null}</li>
| 69 | + * <li>{@code 'x' IN ('x', null, 'c')} => {@code true}</li>
| 70 | + * <li>{@code 'x' IN ('x', ['a', 'b'], 'c')} => {@code true} and a warning</li>
| 71 | + * <li>{@code 'x' IN ('a', ['a', 'b'], 'c')} => {@code false} and a warning</li>
| 72 | + * </ul> |
| 73 | + * <p> |
| 74 | + * And here's the decision tree for {@code WHERE x IN (a, b, c)}: |
| 75 | + * </p> |
| 76 | + * <ol> |
| 77 | + * <li>{@code x IS NULL} => return {@code null}</li> |
| 78 | + * <li>{@code MV_COUNT(x) > 1} => emit a warning and return {@code null}</li> |
| 79 | + * <li>{@code a IS NULL AND b IS NULL AND c IS NULL} => return {@code null}</li> |
| 80 | + * <li>{@code MV_COUNT(a) > 1 OR MV_COUNT(b) > 1 OR MV_COUNT(c) > 1} => emit a warning and continue</li> |
| 81 | + * <li>{@code MV_COUNT(a) > 1 AND MV_COUNT(b) > 1 AND MV_COUNT(c) > 1} => return {@code null}</li> |
| 82 | + * <li>{@code x == a OR x == b OR x == c} => return {@code true}</li> |
| 83 | + * <li>{@code a IS NULL OR b IS NULL OR c IS NULL} => return {@code null}</li> |
| 84 | + * <li>{@code else} => {@code false}</li> |
| 85 | + * </ol> |
| 86 | + * <p> |
| 87 | + * I believe the first five entries are *mostly* optimizations, plus a way of making the
| 88 | + * <a href="https://en.wikipedia.org/wiki/Three-valued_logic">Three-valued logic</a> of SQL
| 89 | + * explicit and integrated with our multivalue field rules, and of making all that work with
| 90 | + * the actual evaluator code. You could probably shorten this to the last three points, but lots
| 91 | + * of folks aren't familiar with SQL's three-valued logic anyway, so let's be explicit. |
| 92 | + * </p> |
| 93 | + * <p> |
| 94 | + * Because of this chain of logic we don't use the standard evaluator generators. They'd just |
| 95 | + * require too many special cases and nothing else quite works like this. I mean, everything |
| 96 | + * works just like this in that "three-valued logic" sort of way, but not in the "java code" |
| 97 | + * sort of way. So! Instead of using the standard evaluator generators we use the |
| 98 | + * String Template generators that we use for things like {@link Block} and {@link Vector}. |
| 99 | + * </p> |
| 100 | + */ |
53 | 101 | public class In extends EsqlScalarFunction { |
54 | 102 | public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "In", In::new); |
55 | 103 |
|
|
0 commit comments