Skip to content

Commit f244bd6

Browse files
Add support for median(<value>) (#4234)
* First revision Signed-off-by: Aaron Alvarez <[email protected]> * Fixing documentation Signed-off-by: Aaron Alvarez <[email protected]> * Removing unnecessary comments Signed-off-by: Aaron Alvarez <[email protected]> * Fixing stats.rst documentation Signed-off-by: Aaron Alvarez <[email protected]> * Fixing documentation Signed-off-by: Aaron Alvarez <[email protected]> * Addressing comments Signed-off-by: Aaron Alvarez <[email protected]> --------- Signed-off-by: Aaron Alvarez <[email protected]> Signed-off-by: Aaron Alvarez <[email protected]> Co-authored-by: Aaron Alvarez <[email protected]>
1 parent ab5f21a commit f244bd6

File tree

8 files changed

+123
-29
lines changed

8 files changed

+123
-29
lines changed

core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ public enum BuiltinFunctionName {
198198
TAKE(FunctionName.of("take")),
199199
// t-digest percentile which is used in OpenSearch core by default.
200200
PERCENTILE_APPROX(FunctionName.of("percentile_approx")),
201+
MEDIAN(FunctionName.of("median")),
201202
EARLIEST(FunctionName.of("earliest")),
202203
LATEST(FunctionName.of("latest")),
203204
DISTINCT_COUNT_APPROX(FunctionName.of("distinct_count_approx")),
@@ -352,6 +353,7 @@ public enum BuiltinFunctionName {
352353
.put("take", BuiltinFunctionName.TAKE)
353354
.put("percentile", BuiltinFunctionName.PERCENTILE_APPROX)
354355
.put("percentile_approx", BuiltinFunctionName.PERCENTILE_APPROX)
356+
.put("median", BuiltinFunctionName.MEDIAN)
355357
.put("earliest", BuiltinFunctionName.EARLIEST)
356358
.put("latest", BuiltinFunctionName.LATEST)
357359
.put("distinct_count_approx", BuiltinFunctionName.DISTINCT_COUNT_APPROX)

core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@
127127
import static org.opensearch.sql.expression.function.BuiltinFunctionName.MATCH_PHRASE_PREFIX;
128128
import static org.opensearch.sql.expression.function.BuiltinFunctionName.MAX;
129129
import static org.opensearch.sql.expression.function.BuiltinFunctionName.MD5;
130+
import static org.opensearch.sql.expression.function.BuiltinFunctionName.MEDIAN;
130131
import static org.opensearch.sql.expression.function.BuiltinFunctionName.MICROSECOND;
131132
import static org.opensearch.sql.expression.function.BuiltinFunctionName.MIN;
132133
import static org.opensearch.sql.expression.function.BuiltinFunctionName.MINUTE;
@@ -259,6 +260,7 @@
259260
import org.apache.logging.log4j.Logger;
260261
import org.opensearch.sql.calcite.CalcitePlanContext;
261262
import org.opensearch.sql.calcite.utils.OpenSearchTypeFactory;
263+
import org.opensearch.sql.calcite.utils.PPLOperandTypes;
262264
import org.opensearch.sql.calcite.utils.PlanUtils;
263265
import org.opensearch.sql.calcite.utils.UserDefinedFunctionUtils;
264266
import org.opensearch.sql.exception.ExpressionEvaluationException;
@@ -1043,6 +1045,7 @@ void register(
10431045
}
10441046

10451047
private static class AggBuilder {
1048+
private static final double MEDIAN_PERCENTILE = 50.0;
10461049
private final Map<BuiltinFunctionName, Pair<CalciteFuncSignature, AggHandler>> map =
10471050
new HashMap<>();
10481051

@@ -1117,6 +1120,9 @@ void populate() {
11171120
register(
11181121
PERCENTILE_APPROX,
11191122
(distinct, field, argList, ctx) -> {
1123+
if (field.getType() == null) {
1124+
throw new IllegalArgumentException("Field type cannot be null");
1125+
}
11201126
List<RexNode> newArgList =
11211127
argList.stream().map(PlanUtils::derefMapCall).collect(Collectors.toList());
11221128
newArgList.add(ctx.rexBuilder.makeFlag(field.getType().getSqlTypeName()));
@@ -1128,6 +1134,31 @@ void populate() {
11281134
PERCENTILE_APPROX.name(),
11291135
false));
11301136

1137+
register(
1138+
MEDIAN,
1139+
(distinct, field, argList, ctx) -> {
1140+
if (distinct) {
1141+
throw new IllegalArgumentException("MEDIAN does not support DISTINCT");
1142+
}
1143+
if (!argList.isEmpty()) {
1144+
throw new IllegalArgumentException("MEDIAN takes no additional arguments");
1145+
}
1146+
if (field.getType() == null) {
1147+
throw new IllegalArgumentException("Field type cannot be null");
1148+
}
1149+
List<RexNode> medianArgList =
1150+
List.of(
1151+
ctx.rexBuilder.makeExactLiteral(BigDecimal.valueOf(MEDIAN_PERCENTILE)),
1152+
ctx.rexBuilder.makeFlag(field.getType().getSqlTypeName()));
1153+
return UserDefinedFunctionUtils.makeAggregateCall(
1154+
PPLBuiltinOperators.PERCENTILE_APPROX,
1155+
List.of(field),
1156+
medianArgList,
1157+
ctx.relBuilder);
1158+
},
1159+
wrapSqlOperandTypeChecker(
1160+
PPLOperandTypes.NUMERIC.getInnerTypeChecker(), MEDIAN.name(), false));
1161+
11311162
register(
11321163
EARLIEST,
11331164
(distinct, field, argList, ctx) -> {

docs/user/dql/aggregations.rst

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -391,34 +391,6 @@ Example::
391391
| M | 36 |
392392
+--------+-----+
393393

394-
Percentile Shortcut Functions
395-
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
396-
397-
For convenience, OpenSearch PPL provides shortcut functions for common percentiles:
398-
399-
- ``PERC<percent>(expr)`` - Equivalent to ``PERCENTILE(expr, <percent>)``
400-
- ``P<percent>(expr)`` - Equivalent to ``PERCENTILE(expr, <percent>)``
401-
402-
Both integer and decimal percentiles from 0 to 100 are supported (e.g., ``PERC95``, ``P99.5``).
403-
404-
Example::
405-
406-
ppl> source=accounts | stats perc99.5(age);
407-
fetched rows / total rows = 1/1
408-
+---------------+
409-
| perc99.5(age) |
410-
|---------------|
411-
| 36 |
412-
+---------------+
413-
414-
ppl> source=accounts | stats p50(age);
415-
fetched rows / total rows = 1/1
416-
+---------+
417-
| p50(age) |
418-
|---------|
419-
| 32 |
420-
+---------+
421-
422394
HAVING Clause
423395
=============
424396

docs/user/ppl/cmd/stats.rst

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ stats <aggregation>... [by-clause]
5151
* Description: The unit of the interval expression is the natural unit by default. If the field is a date and time type field, and the interval is in date/time units, you will need to specify the unit in the interval expression. For example, to split the field ``age`` into buckets by 10 years, it looks like ``span(age, 10)``. And here is another example of time span, the span to split a ``timestamp`` field into hourly intervals, it looks like ``span(timestamp, 1h)``.
5252

5353
* Available time unit:
54+
5455
+----------------------------+
5556
| Span Interval Units |
5657
+============================+
@@ -273,7 +274,7 @@ Example::
273274
+--------------------+
274275

275276
DISTINCT_COUNT_APPROX
276-
----------
277+
---------------------
277278

278279
Description
279280
>>>>>>>>>>>
@@ -336,6 +337,58 @@ Example::
336337
| 36 | M |
337338
+---------------------+--------+
338339

340+
Percentile Shortcut Functions
341+
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
342+
343+
Version: 3.3.0
344+
345+
For convenience, OpenSearch PPL provides shortcut functions for common percentiles:
346+
347+
- ``PERC<percent>(expr)`` - Equivalent to ``PERCENTILE(expr, <percent>)``
348+
- ``P<percent>(expr)`` - Equivalent to ``PERCENTILE(expr, <percent>)``
349+
350+
Both integer and decimal percentiles from 0 to 100 are supported (e.g., ``PERC95``, ``P99.5``).
351+
352+
Example::
353+
354+
ppl> source=accounts | stats perc99.5(age);
355+
fetched rows / total rows = 1/1
356+
+---------------+
357+
| perc99.5(age) |
358+
|---------------|
359+
| 36 |
360+
+---------------+
361+
362+
ppl> source=accounts | stats p50(age);
363+
fetched rows / total rows = 1/1
364+
+----------+
365+
| p50(age) |
366+
|----------|
367+
| 32       |
368+
+----------+
369+
370+
MEDIAN
371+
------
372+
373+
Description
374+
>>>>>>>>>>>
375+
376+
Version: 3.3.0
377+
378+
Usage: MEDIAN(expr). Returns the approximate median (50th percentile) value of ``expr``. This is equivalent to ``PERCENTILE(expr, 50)``.
379+
380+
Note: This function requires Calcite to be enabled (see `Configuration`_ section above).
381+
382+
Example::
383+
384+
os> source=accounts | stats median(age);
385+
fetched rows / total rows = 1/1
386+
+-------------+
387+
| median(age) |
388+
|-------------|
389+
| 33 |
390+
+-------------+
391+
339392
EARLIEST
340393
--------
341394

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLAggregationIT.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -969,4 +969,12 @@ public void testStatsCountAliasByGroupWithSort() throws IOException {
969969
rows(1, "VA"),
970970
rows(1, "WA"));
971971
}
972+
973+
@Test
974+
public void testMedian() throws IOException {
975+
JSONObject actual =
976+
executeQuery(String.format("source=%s | stats median(balance)", TEST_INDEX_BANK));
977+
verifySchema(actual, schema("median(balance)", "bigint"));
978+
verifyDataRows(actual, rows(32838));
979+
}
972980
}

ppl/src/main/antlr/OpenSearchPPLParser.g4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,7 @@ statsFunctionName
553553
| STDDEV_POP
554554
| PERCENTILE
555555
| PERCENTILE_APPROX
556+
| MEDIAN
556557
| LIST
557558
;
558559

ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLAggregationTest.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,4 +696,19 @@ public void testPercentileShortcutInvalidDecimalValueAbove100() {
696696
String ppl = "source=EMP | stats perc100.1(SAL)";
697697
getRelNode(ppl);
698698
}
699+
700+
@Test
701+
public void testMedian() {
702+
String ppl = "source=EMP | stats median(SAL)";
703+
RelNode root = getRelNode(ppl);
704+
String expectedLogical =
705+
"LogicalAggregate(group=[{}], median(SAL)=[percentile_approx($0, $1, $2)])\n"
706+
+ " LogicalProject(SAL=[$5], $f1=[50.0:DECIMAL(3, 1)], $f2=[FLAG(DECIMAL)])\n"
707+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
708+
verifyLogical(root, expectedLogical);
709+
710+
String expectedSparkSql =
711+
"SELECT `percentile_approx`(`SAL`, 50.0, DECIMAL) `median(SAL)`\n" + "FROM `scott`.`EMP`";
712+
verifyPPLToSparkSQL(root, expectedSparkSql);
713+
}
699714
}

ppl/src/test/java/org/opensearch/sql/ppl/parser/AstExpressionBuilderTest.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1298,4 +1298,16 @@ public void testPercentileShortcutFunctionInvalidDecimalValueAbove100() {
12981298
SyntaxCheckException.class,
12991299
() -> assertEqual("source=t | stats perc100.1(a)", (Node) null));
13001300
}
1301+
1302+
@Test
1303+
public void testMedianAggFuncExpr() {
1304+
assertEqual(
1305+
"source=t | stats median(a)",
1306+
agg(
1307+
relation("t"),
1308+
exprList(alias("median(a)", aggregate("median", field("a")))),
1309+
emptyList(),
1310+
emptyList(),
1311+
defaultStatsArgs()));
1312+
}
13011313
}

0 commit comments

Comments
 (0)