Skip to content

Commit 3e64817

Browse files
authored
ESQL: Add CATEGORIZE() check to avoid having multiple groupings (#116660) (#116821)
Added checks to reject unsupported usages of the `CATEGORIZE` grouping function: - It can't be used together with other groupings - It can't be used within other functions - It can't be used or referenced on the aggregates side
1 parent 8126bf5 commit 3e64817

File tree

2 files changed

+135
-0
lines changed

2 files changed

+135
-0
lines changed

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/analysis/Verifier.java

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import org.elasticsearch.xpack.esql.core.expression.Expression;
1818
import org.elasticsearch.xpack.esql.core.expression.Expressions;
1919
import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
20+
import org.elasticsearch.xpack.esql.core.expression.NameId;
2021
import org.elasticsearch.xpack.esql.core.expression.NamedExpression;
2122
import org.elasticsearch.xpack.esql.core.expression.TypeResolutions;
2223
import org.elasticsearch.xpack.esql.core.expression.function.Function;
@@ -33,6 +34,7 @@
3334
import org.elasticsearch.xpack.esql.expression.function.fulltext.FullTextFunction;
3435
import org.elasticsearch.xpack.esql.expression.function.fulltext.Match;
3536
import org.elasticsearch.xpack.esql.expression.function.fulltext.QueryString;
37+
import org.elasticsearch.xpack.esql.expression.function.grouping.Categorize;
3638
import org.elasticsearch.xpack.esql.expression.function.grouping.GroupingFunction;
3739
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Neg;
3840
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.Equals;
@@ -56,10 +58,12 @@
5658
import java.util.ArrayList;
5759
import java.util.BitSet;
5860
import java.util.Collection;
61+
import java.util.HashMap;
5962
import java.util.HashSet;
6063
import java.util.LinkedHashSet;
6164
import java.util.List;
6265
import java.util.Locale;
66+
import java.util.Map;
6367
import java.util.Set;
6468
import java.util.function.BiConsumer;
6569
import java.util.function.Consumer;
@@ -271,6 +275,7 @@ private static void checkAggregate(LogicalPlan p, Set<Failure> failures) {
271275
r -> failures.add(fail(r, "the rate aggregate[{}] can only be used within the metrics command", r.sourceText()))
272276
);
273277
}
278+
checkCategorizeGrouping(agg, failures);
274279
} else {
275280
p.forEachExpression(
276281
GroupingFunction.class,
@@ -279,6 +284,74 @@ private static void checkAggregate(LogicalPlan p, Set<Failure> failures) {
279284
}
280285
}
281286

287+
/**
288+
* Check CATEGORIZE grouping function usages.
289+
* <p>
290+
* Some of those checks are temporary, until the required syntax or engine changes are implemented.
291+
* </p>
292+
*/
293+
private static void checkCategorizeGrouping(Aggregate agg, Set<Failure> failures) {
294+
// Forbid CATEGORIZE grouping function with other groupings
295+
if (agg.groupings().size() > 1) {
296+
agg.groupings().forEach(g -> {
297+
g.forEachDown(
298+
Categorize.class,
299+
categorize -> failures.add(
300+
fail(categorize, "cannot use CATEGORIZE grouping function [{}] with multiple groupings", categorize.sourceText())
301+
)
302+
);
303+
});
304+
}
305+
306+
// Forbid CATEGORIZE grouping functions not being top level groupings
307+
agg.groupings().forEach(g -> {
308+
// Check all CATEGORIZE but the top level one
309+
Alias.unwrap(g)
310+
.children()
311+
.forEach(
312+
child -> child.forEachDown(
313+
Categorize.class,
314+
c -> failures.add(
315+
fail(c, "CATEGORIZE grouping function [{}] can't be used within other expressions", c.sourceText())
316+
)
317+
)
318+
);
319+
});
320+
321+
// Forbid CATEGORIZE being used in the aggregations
322+
agg.aggregates().forEach(a -> {
323+
a.forEachDown(
324+
Categorize.class,
325+
categorize -> failures.add(
326+
fail(categorize, "cannot use CATEGORIZE grouping function [{}] within the aggregations", categorize.sourceText())
327+
)
328+
);
329+
});
330+
331+
// Forbid CATEGORIZE being referenced in the aggregation functions
332+
Map<NameId, Categorize> categorizeByAliasId = new HashMap<>();
333+
agg.groupings().forEach(g -> {
334+
g.forEachDown(Alias.class, alias -> {
335+
if (alias.child() instanceof Categorize categorize) {
336+
categorizeByAliasId.put(alias.id(), categorize);
337+
}
338+
});
339+
});
340+
agg.aggregates()
341+
.forEach(a -> a.forEachDown(AggregateFunction.class, aggregate -> aggregate.forEachDown(Attribute.class, attribute -> {
342+
var categorize = categorizeByAliasId.get(attribute.id());
343+
if (categorize != null) {
344+
failures.add(
345+
fail(
346+
attribute,
347+
"cannot reference CATEGORIZE grouping function [{}] within the aggregations",
348+
attribute.sourceText()
349+
)
350+
);
351+
}
352+
})));
353+
}
354+
282355
private static void checkRateAggregates(Expression expr, int nestedLevel, Set<Failure> failures) {
283356
if (expr instanceof AggregateFunction) {
284357
nestedLevel++;

x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1737,6 +1737,68 @@ public void testIntervalAsString() {
17371737
);
17381738
}
17391739

1740+
public void testCategorizeSingleGrouping() {
1741+
query("from test | STATS COUNT(*) BY CATEGORIZE(first_name)");
1742+
query("from test | STATS COUNT(*) BY cat = CATEGORIZE(first_name)");
1743+
1744+
assertEquals(
1745+
"1:31: cannot use CATEGORIZE grouping function [CATEGORIZE(first_name)] with multiple groupings",
1746+
error("from test | STATS COUNT(*) BY CATEGORIZE(first_name), emp_no")
1747+
);
1748+
assertEquals(
1749+
"1:39: cannot use CATEGORIZE grouping function [CATEGORIZE(first_name)] with multiple groupings",
1750+
error("FROM test | STATS COUNT(*) BY emp_no, CATEGORIZE(first_name)")
1751+
);
1752+
assertEquals(
1753+
"1:35: cannot use CATEGORIZE grouping function [CATEGORIZE(first_name)] with multiple groupings",
1754+
error("FROM test | STATS COUNT(*) BY a = CATEGORIZE(first_name), b = emp_no")
1755+
);
1756+
assertEquals(
1757+
"1:31: cannot use CATEGORIZE grouping function [CATEGORIZE(first_name)] with multiple groupings\n"
1758+
+ "line 1:55: cannot use CATEGORIZE grouping function [CATEGORIZE(last_name)] with multiple groupings",
1759+
error("FROM test | STATS COUNT(*) BY CATEGORIZE(first_name), CATEGORIZE(last_name)")
1760+
);
1761+
assertEquals(
1762+
"1:31: cannot use CATEGORIZE grouping function [CATEGORIZE(first_name)] with multiple groupings",
1763+
error("FROM test | STATS COUNT(*) BY CATEGORIZE(first_name), CATEGORIZE(first_name)")
1764+
);
1765+
}
1766+
1767+
public void testCategorizeNestedGrouping() {
1768+
query("from test | STATS COUNT(*) BY CATEGORIZE(LENGTH(first_name)::string)");
1769+
1770+
assertEquals(
1771+
"1:40: CATEGORIZE grouping function [CATEGORIZE(first_name)] can't be used within other expressions",
1772+
error("FROM test | STATS COUNT(*) BY MV_COUNT(CATEGORIZE(first_name))")
1773+
);
1774+
assertEquals(
1775+
"1:31: CATEGORIZE grouping function [CATEGORIZE(first_name)] can't be used within other expressions",
1776+
error("FROM test | STATS COUNT(*) BY CATEGORIZE(first_name)::datetime")
1777+
);
1778+
}
1779+
1780+
public void testCategorizeWithinAggregations() {
1781+
query("from test | STATS MV_COUNT(cat), COUNT(*) BY cat = CATEGORIZE(first_name)");
1782+
1783+
assertEquals(
1784+
"1:25: cannot use CATEGORIZE grouping function [CATEGORIZE(first_name)] within the aggregations",
1785+
error("FROM test | STATS COUNT(CATEGORIZE(first_name)) BY CATEGORIZE(first_name)")
1786+
);
1787+
1788+
assertEquals(
1789+
"1:25: cannot reference CATEGORIZE grouping function [cat] within the aggregations",
1790+
error("FROM test | STATS COUNT(cat) BY cat = CATEGORIZE(first_name)")
1791+
);
1792+
assertEquals(
1793+
"1:30: cannot reference CATEGORIZE grouping function [cat] within the aggregations",
1794+
error("FROM test | STATS SUM(LENGTH(cat::keyword) + LENGTH(last_name)) BY cat = CATEGORIZE(first_name)")
1795+
);
1796+
assertEquals(
1797+
"1:25: cannot reference CATEGORIZE grouping function [`CATEGORIZE(first_name)`] within the aggregations",
1798+
error("FROM test | STATS COUNT(`CATEGORIZE(first_name)`) BY CATEGORIZE(first_name)")
1799+
);
1800+
}
1801+
17401802
private void query(String query) {
17411803
defaultAnalyzer.analyze(parser.createStatement(query));
17421804
}

0 commit comments

Comments
 (0)