Skip to content

Commit 2753fa8

Browse files
authored
ESQL: Keep ordinals in conversion functions (#125357) (#127933)
Make the conversion functions that process `BytesRef`s into `BytesRef`s keep the `OrdinalBytesRefVector`s when processing. Let's use `TO_LOWER` as an example. First, the performance numbers:

```
  (operation)  Mode   Score   Error  ->   Score   Error  Units
     to_lower        30.662 ± 6.163  ->  30.048 ± 0.479  ns/op
to_lower_ords        30.773 ± 0.370  ->   0.025 ± 0.001  ns/op
     to_upper        33.552 ± 0.529  ->  35.775 ± 1.799  ns/op
to_upper_ords        35.791 ± 0.658  ->   0.027 ± 0.001  ns/op
```

The test has 8192 positions containing alternating `foo` and `bar`. Running `TO_LOWER` via ordinals is dramatically faster: it is no longer `O(positions)` but `O(unique_values)`.

Let's paint some pictures! `OrdinalBytesRefVector` is a lookup table. Like this:

```
+-------+----------+
| bytes | ordinals |
| ----- | -------- |
|   FOO |        0 |
|   BAR |        1 |
|   BAZ |        2 |
+-------+        1 |
        |        1 |
        |        0 |
        +----------+
```

That lookup table is one block. When you read it you look up the `ordinal` and match it to the `bytes`. Previously `TO_LOWER` would process each value one at a time and make:

```
bytes
-----
  foo
  bar
  baz
  bar
  bar
  foo
```

So it'd run `TO_LOWER` once per position and it'd produce a plain vector of values rather than a lookup table. With this change `TO_LOWER` will now make:

```
+-------+----------+
| bytes | ordinals |
| ----- | -------- |
|   foo |        0 |
|   bar |        1 |
|   baz |        2 |
+-------+        1 |
        |        1 |
        |        0 |
        +----------+
```

We don't even have to copy the `ordinals` - we can reuse those from the input and just bump the reference count. That's why this goes from `O(positions)` to `O(unique_values)`.
1 parent e9174bd commit 2753fa8

File tree

17 files changed

+417
-10
lines changed

17 files changed

+417
-10
lines changed

benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/EvalBenchmark.java

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,19 @@
1111

1212
import org.apache.lucene.util.BytesRef;
1313
import org.elasticsearch.common.breaker.NoopCircuitBreaker;
14+
import org.elasticsearch.common.settings.Settings;
1415
import org.elasticsearch.common.util.BigArrays;
1516
import org.elasticsearch.compute.data.Block;
1617
import org.elasticsearch.compute.data.BlockFactory;
1718
import org.elasticsearch.compute.data.BooleanBlock;
1819
import org.elasticsearch.compute.data.BooleanVector;
20+
import org.elasticsearch.compute.data.BytesRefBlock;
21+
import org.elasticsearch.compute.data.BytesRefVector;
1922
import org.elasticsearch.compute.data.DoubleBlock;
2023
import org.elasticsearch.compute.data.DoubleVector;
2124
import org.elasticsearch.compute.data.LongBlock;
2225
import org.elasticsearch.compute.data.LongVector;
26+
import org.elasticsearch.compute.data.OrdinalBytesRefVector;
2327
import org.elasticsearch.compute.data.Page;
2428
import org.elasticsearch.compute.operator.DriverContext;
2529
import org.elasticsearch.compute.operator.EvalOperator;
@@ -40,9 +44,13 @@
4044
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvMin;
4145
import org.elasticsearch.xpack.esql.expression.function.scalar.nulls.Coalesce;
4246
import org.elasticsearch.xpack.esql.expression.function.scalar.string.RLike;
47+
import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToLower;
48+
import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToUpper;
4349
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Add;
4450
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.Equals;
4551
import org.elasticsearch.xpack.esql.planner.Layout;
52+
import org.elasticsearch.xpack.esql.plugin.EsqlPlugin;
53+
import org.elasticsearch.xpack.esql.session.Configuration;
4654
import org.openjdk.jmh.annotations.Benchmark;
4755
import org.openjdk.jmh.annotations.BenchmarkMode;
4856
import org.openjdk.jmh.annotations.Fork;
@@ -56,8 +64,10 @@
5664
import org.openjdk.jmh.annotations.Warmup;
5765

5866
import java.time.Duration;
67+
import java.time.ZoneOffset;
5968
import java.util.Arrays;
6069
import java.util.List;
70+
import java.util.Locale;
6171
import java.util.Map;
6272
import java.util.concurrent.TimeUnit;
6373

@@ -106,7 +116,11 @@ public class EvalBenchmark {
106116
"long_equal_to_int",
107117
"mv_min",
108118
"mv_min_ascending",
109-
"rlike" }
119+
"rlike",
120+
"to_lower",
121+
"to_lower_ords",
122+
"to_upper",
123+
"to_upper_ords" }
110124
)
111125
public String operation;
112126

@@ -214,6 +228,16 @@ private static EvalOperator.ExpressionEvaluator evaluator(String operation) {
214228
RLike rlike = new RLike(Source.EMPTY, keywordField, new RLikePattern(".ar"));
215229
yield EvalMapper.toEvaluator(FOLD_CONTEXT, rlike, layout(keywordField)).get(driverContext);
216230
}
231+
case "to_lower", "to_lower_ords" -> {
232+
FieldAttribute keywordField = keywordField();
233+
ToLower toLower = new ToLower(Source.EMPTY, keywordField, configuration());
234+
yield EvalMapper.toEvaluator(FOLD_CONTEXT, toLower, layout(keywordField)).get(driverContext);
235+
}
236+
case "to_upper", "to_upper_ords" -> {
237+
FieldAttribute keywordField = keywordField();
238+
ToUpper toUpper = new ToUpper(Source.EMPTY, keywordField, configuration());
239+
yield EvalMapper.toEvaluator(FOLD_CONTEXT, toUpper, layout(keywordField)).get(driverContext);
240+
}
217241
default -> throw new UnsupportedOperationException();
218242
};
219243
}
@@ -366,10 +390,35 @@ private static void checkExpected(String operation, Page actual) {
366390
}
367391
}
368392
}
393+
case "to_lower" -> checkBytes(operation, actual, false, new BytesRef[] { new BytesRef("foo"), new BytesRef("bar") });
394+
case "to_lower_ords" -> checkBytes(operation, actual, true, new BytesRef[] { new BytesRef("foo"), new BytesRef("bar") });
395+
case "to_upper" -> checkBytes(operation, actual, false, new BytesRef[] { new BytesRef("FOO"), new BytesRef("BAR") });
396+
case "to_upper_ords" -> checkBytes(operation, actual, true, new BytesRef[] { new BytesRef("FOO"), new BytesRef("BAR") });
369397
default -> throw new UnsupportedOperationException(operation);
370398
}
371399
}
372400

401+
private static void checkBytes(String operation, Page actual, boolean expectOrds, BytesRef[] expectedVals) {
402+
BytesRef scratch = new BytesRef();
403+
BytesRefVector v = actual.<BytesRefBlock>getBlock(1).asVector();
404+
for (int i = 0; i < BLOCK_LENGTH; i++) {
405+
BytesRef expected = expectedVals[i % 2];
406+
BytesRef b = v.getBytesRef(i, scratch);
407+
if (b.equals(expected) == false) {
408+
throw new AssertionError("[" + operation + "] expected [" + expected + "] but was [" + b + "]");
409+
}
410+
}
411+
if (expectOrds) {
412+
if (v.asOrdinals() == null) {
413+
throw new IllegalArgumentException("expected ords but got " + v);
414+
}
415+
} else {
416+
if (v.asOrdinals() != null) {
417+
throw new IllegalArgumentException("expected non-ords but got " + v);
418+
}
419+
}
420+
}
421+
373422
private static Page page(String operation) {
374423
return switch (operation) {
375424
case "abs", "add", "date_trunc", "equal_to_const" -> {
@@ -448,6 +497,16 @@ private static Page page(String operation) {
448497
}
449498
yield new Page(builder.build().asBlock());
450499
}
500+
case "to_lower_ords", "to_upper_ords" -> {
501+
var bytes = blockFactory.newBytesRefVectorBuilder(BLOCK_LENGTH);
502+
bytes.appendBytesRef(new BytesRef("foo"));
503+
bytes.appendBytesRef(new BytesRef("bar"));
504+
var ordinals = blockFactory.newIntVectorFixedBuilder(BLOCK_LENGTH);
505+
for (int i = 0; i < BLOCK_LENGTH; i++) {
506+
ordinals.appendInt(i % 2);
507+
}
508+
yield new Page(new OrdinalBytesRefVector(ordinals.build(), bytes.build()).asBlock());
509+
}
451510
default -> throw new UnsupportedOperationException();
452511
};
453512
}
@@ -470,4 +529,21 @@ private static void run(String operation) {
470529
checkExpected(operation, output);
471530
}
472531
}
532+
533+
private static Configuration configuration() {
534+
return new Configuration(
535+
ZoneOffset.UTC,
536+
Locale.ROOT,
537+
null,
538+
null,
539+
null,
540+
EsqlPlugin.QUERY_RESULT_TRUNCATION_MAX_SIZE.get(Settings.EMPTY),
541+
EsqlPlugin.QUERY_RESULT_TRUNCATION_DEFAULT_SIZE.get(Settings.EMPTY),
542+
null,
543+
false,
544+
Map.of(),
545+
0,
546+
false
547+
);
548+
}
473549
}

docs/changelog/125357.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 125357
2+
summary: Keep ordinals in conversion functions
3+
area: ES|QL
4+
type: enhancement
5+
issues: []

x-pack/plugin/esql/compute/gen/src/main/java/org/elasticsearch/compute/gen/ConvertEvaluatorImplementer.java

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,12 @@
2828
import static org.elasticsearch.compute.gen.Types.ABSTRACT_CONVERT_FUNCTION_EVALUATOR;
2929
import static org.elasticsearch.compute.gen.Types.BLOCK;
3030
import static org.elasticsearch.compute.gen.Types.BYTES_REF;
31+
import static org.elasticsearch.compute.gen.Types.BYTES_REF_VECTOR_BUILDER;
3132
import static org.elasticsearch.compute.gen.Types.DRIVER_CONTEXT;
3233
import static org.elasticsearch.compute.gen.Types.EXPRESSION_EVALUATOR;
3334
import static org.elasticsearch.compute.gen.Types.EXPRESSION_EVALUATOR_FACTORY;
35+
import static org.elasticsearch.compute.gen.Types.INT_VECTOR;
36+
import static org.elasticsearch.compute.gen.Types.ORDINALS_BYTES_REF_VECTOR;
3437
import static org.elasticsearch.compute.gen.Types.SOURCE;
3538
import static org.elasticsearch.compute.gen.Types.VECTOR;
3639
import static org.elasticsearch.compute.gen.Types.blockType;
@@ -41,7 +44,7 @@ public class ConvertEvaluatorImplementer {
4144

4245
private final TypeElement declarationType;
4346
private final EvaluatorImplementer.ProcessFunction processFunction;
44-
private final String extraName;
47+
private final boolean canProcessOrdinals;
4548
private final ClassName implementation;
4649
private final TypeName argumentType;
4750
private final List<TypeMirror> warnExceptions;
@@ -55,6 +58,10 @@ public ConvertEvaluatorImplementer(
5558
) {
5659
this.declarationType = (TypeElement) processFunction.getEnclosingElement();
5760
this.processFunction = new EvaluatorImplementer.ProcessFunction(types, processFunction, warnExceptions);
61+
this.canProcessOrdinals = warnExceptions.isEmpty()
62+
&& this.processFunction.returnType().equals(BYTES_REF)
63+
&& this.processFunction.args.get(0) instanceof EvaluatorImplementer.StandardProcessFunctionArg s
64+
&& s.type().equals(BYTES_REF);
5865

5966
if (this.processFunction.args.get(0) instanceof EvaluatorImplementer.StandardProcessFunctionArg == false) {
6067
throw new IllegalArgumentException("first argument must be the field to process");
@@ -66,7 +73,6 @@ public ConvertEvaluatorImplementer(
6673
}
6774
}
6875

69-
this.extraName = extraName;
7076
this.argumentType = TypeName.get(processFunction.getParameters().get(0).asType());
7177
this.warnExceptions = warnExceptions;
7278

@@ -102,6 +108,9 @@ private TypeSpec type() {
102108
builder.addMethod(evalValue(true));
103109
builder.addMethod(evalBlock());
104110
builder.addMethod(evalValue(false));
111+
if (canProcessOrdinals) {
112+
builder.addMethod(evalOrdinals());
113+
}
105114
builder.addMethod(processFunction.toStringMethod(implementation));
106115
builder.addMethod(processFunction.close());
107116
builder.addType(factory());
@@ -132,6 +141,15 @@ private MethodSpec evalVector() {
132141

133142
TypeName vectorType = vectorType(argumentType);
134143
builder.addStatement("$T vector = ($T) v", vectorType, vectorType);
144+
if (canProcessOrdinals) {
145+
builder.addStatement("$T ordinals = vector.asOrdinals()", ORDINALS_BYTES_REF_VECTOR);
146+
builder.beginControlFlow("if (ordinals != null)");
147+
{
148+
builder.addStatement("return evalOrdinals(ordinals)");
149+
}
150+
builder.endControlFlow();
151+
}
152+
135153
builder.addStatement("int positionCount = v.getPositionCount()");
136154

137155
String scratchPadName = argumentType.equals(BYTES_REF) ? "scratchPad" : null;
@@ -299,6 +317,31 @@ private MethodSpec evalValue(boolean forVector) {
299317
return builder.build();
300318
}
301319

320+
private MethodSpec evalOrdinals() {
321+
MethodSpec.Builder builder = MethodSpec.methodBuilder("evalOrdinals").addModifiers(Modifier.PRIVATE);
322+
builder.addParameter(ORDINALS_BYTES_REF_VECTOR, "v").returns(BLOCK);
323+
324+
builder.addStatement("int positionCount = v.getDictionaryVector().getPositionCount()");
325+
builder.addStatement("BytesRef scratchPad = new BytesRef()");
326+
builder.beginControlFlow(
327+
"try ($T builder = driverContext.blockFactory().newBytesRefVectorBuilder(positionCount))",
328+
BYTES_REF_VECTOR_BUILDER
329+
);
330+
{
331+
builder.beginControlFlow("for (int p = 0; p < positionCount; p++)");
332+
{
333+
builder.addStatement("builder.appendBytesRef($N)", evalValueCall("v.getDictionaryVector()", "p", "scratchPad"));
334+
}
335+
builder.endControlFlow();
336+
builder.addStatement("$T ordinals = v.getOrdinalsVector()", INT_VECTOR);
337+
builder.addStatement("ordinals.incRef()");
338+
builder.addStatement("return new $T(ordinals, builder.build()).asBlock()", ORDINALS_BYTES_REF_VECTOR);
339+
}
340+
builder.endControlFlow();
341+
342+
return builder.build();
343+
}
344+
302345
private TypeSpec factory() {
303346
TypeSpec.Builder builder = TypeSpec.classBuilder("Factory");
304347
builder.addSuperinterface(EXPRESSION_EVALUATOR_FACTORY);

x-pack/plugin/esql/compute/gen/src/main/java/org/elasticsearch/compute/gen/Types.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ public class Types {
6161

6262
static final ClassName BOOLEAN_VECTOR = ClassName.get(DATA_PACKAGE, "BooleanVector");
6363
static final ClassName BYTES_REF_VECTOR = ClassName.get(DATA_PACKAGE, "BytesRefVector");
64+
static final ClassName ORDINALS_BYTES_REF_VECTOR = ClassName.get(DATA_PACKAGE, "OrdinalBytesRefVector");
6465
static final ClassName INT_VECTOR = ClassName.get(DATA_PACKAGE, "IntVector");
6566
static final ClassName LONG_VECTOR = ClassName.get(DATA_PACKAGE, "LongVector");
6667
static final ClassName DOUBLE_VECTOR = ClassName.get(DATA_PACKAGE, "DoubleVector");

x-pack/plugin/esql/compute/test/src/main/java/org/elasticsearch/compute/test/BlockTestUtils.java

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,23 @@
1313
import org.elasticsearch.compute.data.BlockUtils;
1414
import org.elasticsearch.compute.data.BooleanBlock;
1515
import org.elasticsearch.compute.data.BytesRefBlock;
16+
import org.elasticsearch.compute.data.BytesRefVector;
1617
import org.elasticsearch.compute.data.DocBlock;
1718
import org.elasticsearch.compute.data.DoubleBlock;
1819
import org.elasticsearch.compute.data.ElementType;
1920
import org.elasticsearch.compute.data.FloatBlock;
2021
import org.elasticsearch.compute.data.IntBlock;
2122
import org.elasticsearch.compute.data.LongBlock;
23+
import org.elasticsearch.compute.data.OrdinalBytesRefBlock;
2224
import org.elasticsearch.compute.data.Page;
25+
import org.elasticsearch.core.Releasables;
2326
import org.hamcrest.Matcher;
2427

2528
import java.util.ArrayList;
29+
import java.util.Arrays;
30+
import java.util.HashMap;
2631
import java.util.List;
32+
import java.util.Map;
2733

2834
import static org.elasticsearch.compute.data.BlockUtils.toJavaObject;
2935
import static org.elasticsearch.test.ESTestCase.between;
@@ -268,4 +274,67 @@ public static List<List<Object>> valuesAtPositions(Block block, int from, int to
268274
}
269275
return result;
270276
}
277+
278+
/**
279+
* Convert all of the {@link Block}s in a page that contain {@link BytesRef}s into
280+
* {@link OrdinalBytesRefBlock}s.
281+
*/
282+
public static Page convertBytesRefsToOrdinals(Page page) {
283+
Block[] blocks = new Block[page.getBlockCount()];
284+
try {
285+
for (int b = 0; b < page.getBlockCount(); b++) {
286+
Block block = page.getBlock(b);
287+
if (block.elementType() != ElementType.BYTES_REF) {
288+
blocks[b] = block;
289+
continue;
290+
}
291+
Map<BytesRef, Integer> dedupe = new HashMap<>();
292+
BytesRefBlock bytesRefBlock = (BytesRefBlock) block;
293+
try (
294+
IntBlock.Builder ordinals = block.blockFactory().newIntBlockBuilder(block.getPositionCount());
295+
BytesRefVector.Builder bytes = block.blockFactory().newBytesRefVectorBuilder(block.getPositionCount())
296+
) {
297+
BytesRef scratch = new BytesRef();
298+
for (int p = 0; p < block.getPositionCount(); p++) {
299+
int first = block.getFirstValueIndex(p);
300+
int count = block.getValueCount(p);
301+
if (count == 0) {
302+
ordinals.appendNull();
303+
continue;
304+
}
305+
if (count == 1) {
306+
BytesRef v = bytesRefBlock.getBytesRef(first, scratch);
307+
ordinals.appendInt(dedupe(dedupe, bytes, v));
308+
continue;
309+
}
310+
int end = first + count;
311+
ordinals.beginPositionEntry();
312+
for (int i = first; i < end; i++) {
313+
BytesRef v = bytesRefBlock.getBytesRef(i, scratch);
314+
ordinals.appendInt(dedupe(dedupe, bytes, v));
315+
}
316+
ordinals.endPositionEntry();
317+
}
318+
blocks[b] = new OrdinalBytesRefBlock(ordinals.build(), bytes.build());
319+
bytesRefBlock.decRef();
320+
}
321+
}
322+
Page p = new Page(blocks);
323+
Arrays.fill(blocks, null);
324+
return p;
325+
} finally {
326+
Releasables.close(blocks);
327+
}
328+
}
329+
330+
private static int dedupe(Map<BytesRef, Integer> dedupe, BytesRefVector.Builder bytes, BytesRef v) {
331+
Integer current = dedupe.get(v);
332+
if (current != null) {
333+
return current;
334+
}
335+
bytes.appendBytesRef(v);
336+
int o = dedupe.size();
337+
dedupe.put(v, o);
338+
return o;
339+
}
271340
}

x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/convert/ToStringFromCartesianPointEvaluator.java

Lines changed: 19 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)