Skip to content

Commit c5e7684

Browse files
authored
ESQL: Keep ordinals in conversion functions (#125357)
Make the conversion functions that process `BytesRef`s into `BytesRef`s keep the `OrdinalBytesRefVector`s when processing. Let's use `TO_LOWER` as an example.

First, the performance numbers:

```
  (operation)  Mode   Score   Error ->  Score   Error  Units
     to_lower        30.662 ± 6.163 -> 30.048 ± 0.479  ns/op
to_lower_ords        30.773 ± 0.370 ->  0.025 ± 0.001  ns/op
     to_upper        33.552 ± 0.529 -> 35.775 ± 1.799  ns/op
to_upper_ords        35.791 ± 0.658 ->  0.027 ± 0.001  ns/op
```

The benchmark has 8192 positions containing alternating `foo` and `bar`. Running `TO_LOWER` via ordinals is dramatically faster: it is no longer `O(positions)` and is now `O(unique_values)`.

Let's paint some pictures! `OrdinalBytesRefVector` is a lookup table. Like this:

```
+-------+----------+
| bytes | ordinals |
| ----- | -------- |
|  FOO  |    0     |
|  BAR  |    1     |
|  BAZ  |    2     |
+-------+    1     |
        |    1     |
        |    0     |
        +----------+
```

That lookup table is one block. When you read it you look up the `ordinal` and match it to the `bytes`. Previously `TO_LOWER` would process each value one at a time and make:

```
bytes
-----
 foo
 bar
 baz
 bar
 bar
 foo
```

So it'd run `TO_LOWER` once per position (that is, once per entry in the `ordinals` column) and it'd make a plain, non-ordinal vector with no lookup table. With this change `TO_LOWER` will now make:

```
+-------+----------+
| bytes | ordinals |
| ----- | -------- |
|  foo  |    0     |
|  bar  |    1     |
|  baz  |    2     |
+-------+    1     |
        |    1     |
        |    0     |
        +----------+
```

We don't even have to copy the `ordinals` - we can reuse those from the input and just bump the reference count. That's why this goes from `O(positions)` to `O(unique_values)`.
1 parent 0930a75 commit c5e7684

File tree

17 files changed

+370
-15
lines changed

17 files changed

+370
-15
lines changed

benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/EvalBenchmark.java

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.elasticsearch.compute.data.DoubleVector;
2525
import org.elasticsearch.compute.data.LongBlock;
2626
import org.elasticsearch.compute.data.LongVector;
27+
import org.elasticsearch.compute.data.OrdinalBytesRefVector;
2728
import org.elasticsearch.compute.data.Page;
2829
import org.elasticsearch.compute.operator.DriverContext;
2930
import org.elasticsearch.compute.operator.EvalOperator;
@@ -127,7 +128,9 @@ static void selfTest() {
127128
"mv_min_ascending",
128129
"rlike",
129130
"to_lower",
130-
"to_upper" }
131+
"to_lower_ords",
132+
"to_upper",
133+
"to_upper_ords" }
131134
)
132135
public String operation;
133136

@@ -235,12 +238,12 @@ private static EvalOperator.ExpressionEvaluator evaluator(String operation) {
235238
RLike rlike = new RLike(Source.EMPTY, keywordField, new RLikePattern(".ar"));
236239
yield EvalMapper.toEvaluator(FOLD_CONTEXT, rlike, layout(keywordField)).get(driverContext);
237240
}
238-
case "to_lower" -> {
241+
case "to_lower", "to_lower_ords" -> {
239242
FieldAttribute keywordField = keywordField();
240243
ToLower toLower = new ToLower(Source.EMPTY, keywordField, configuration());
241244
yield EvalMapper.toEvaluator(FOLD_CONTEXT, toLower, layout(keywordField)).get(driverContext);
242245
}
243-
case "to_upper" -> {
246+
case "to_upper", "to_upper_ords" -> {
244247
FieldAttribute keywordField = keywordField();
245248
ToUpper toUpper = new ToUpper(Source.EMPTY, keywordField, configuration());
246249
yield EvalMapper.toEvaluator(FOLD_CONTEXT, toUpper, layout(keywordField)).get(driverContext);
@@ -414,13 +417,15 @@ private static void checkExpected(String operation, Page actual) {
414417
}
415418
}
416419
}
417-
case "to_lower" -> checkBytes(operation, actual, new BytesRef[] { new BytesRef("foo"), new BytesRef("bar") });
418-
case "to_upper" -> checkBytes(operation, actual, new BytesRef[] { new BytesRef("FOO"), new BytesRef("BAR") });
420+
case "to_lower" -> checkBytes(operation, actual, false, new BytesRef[] { new BytesRef("foo"), new BytesRef("bar") });
421+
case "to_lower_ords" -> checkBytes(operation, actual, true, new BytesRef[] { new BytesRef("foo"), new BytesRef("bar") });
422+
case "to_upper" -> checkBytes(operation, actual, false, new BytesRef[] { new BytesRef("FOO"), new BytesRef("BAR") });
423+
case "to_upper_ords" -> checkBytes(operation, actual, true, new BytesRef[] { new BytesRef("FOO"), new BytesRef("BAR") });
419424
default -> throw new UnsupportedOperationException(operation);
420425
}
421426
}
422427

423-
private static void checkBytes(String operation, Page actual, BytesRef[] expectedVals) {
428+
private static void checkBytes(String operation, Page actual, boolean expectOrds, BytesRef[] expectedVals) {
424429
BytesRef scratch = new BytesRef();
425430
BytesRefVector v = actual.<BytesRefBlock>getBlock(1).asVector();
426431
for (int i = 0; i < BLOCK_LENGTH; i++) {
@@ -430,6 +435,15 @@ private static void checkBytes(String operation, Page actual, BytesRef[] expecte
430435
throw new AssertionError("[" + operation + "] expected [" + expected + "] but was [" + b + "]");
431436
}
432437
}
438+
if (expectOrds) {
439+
if (v.asOrdinals() == null) {
440+
throw new IllegalArgumentException("expected ords but got " + v);
441+
}
442+
} else {
443+
if (v.asOrdinals() != null) {
444+
throw new IllegalArgumentException("expected non-ords but got " + v);
445+
}
446+
}
433447
}
434448

435449
private static Page page(String operation) {
@@ -510,6 +524,16 @@ private static Page page(String operation) {
510524
}
511525
yield new Page(builder.build().asBlock());
512526
}
527+
case "to_lower_ords", "to_upper_ords" -> {
528+
var bytes = blockFactory.newBytesRefVectorBuilder(BLOCK_LENGTH);
529+
bytes.appendBytesRef(new BytesRef("foo"));
530+
bytes.appendBytesRef(new BytesRef("bar"));
531+
var ordinals = blockFactory.newIntVectorFixedBuilder(BLOCK_LENGTH);
532+
for (int i = 0; i < BLOCK_LENGTH; i++) {
533+
ordinals.appendInt(i % 2);
534+
}
535+
yield new Page(new OrdinalBytesRefVector(ordinals.build(), bytes.build()).asBlock());
536+
}
513537
default -> throw new UnsupportedOperationException();
514538
};
515539
}

docs/changelog/125357.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 125357
2+
summary: Keep ordinals in conversion functions
3+
area: ES|QL
4+
type: enhancement
5+
issues: []

x-pack/plugin/esql/compute/gen/src/main/java/org/elasticsearch/compute/gen/ConvertEvaluatorImplementer.java

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,12 @@
2828
import static org.elasticsearch.compute.gen.Types.ABSTRACT_CONVERT_FUNCTION_EVALUATOR;
2929
import static org.elasticsearch.compute.gen.Types.BLOCK;
3030
import static org.elasticsearch.compute.gen.Types.BYTES_REF;
31+
import static org.elasticsearch.compute.gen.Types.BYTES_REF_VECTOR_BUILDER;
3132
import static org.elasticsearch.compute.gen.Types.DRIVER_CONTEXT;
3233
import static org.elasticsearch.compute.gen.Types.EXPRESSION_EVALUATOR;
3334
import static org.elasticsearch.compute.gen.Types.EXPRESSION_EVALUATOR_FACTORY;
35+
import static org.elasticsearch.compute.gen.Types.INT_VECTOR;
36+
import static org.elasticsearch.compute.gen.Types.ORDINALS_BYTES_REF_VECTOR;
3437
import static org.elasticsearch.compute.gen.Types.SOURCE;
3538
import static org.elasticsearch.compute.gen.Types.VECTOR;
3639
import static org.elasticsearch.compute.gen.Types.blockType;
@@ -41,7 +44,7 @@ public class ConvertEvaluatorImplementer {
4144

4245
private final TypeElement declarationType;
4346
private final EvaluatorImplementer.ProcessFunction processFunction;
44-
private final String extraName;
47+
private final boolean canProcessOrdinals;
4548
private final ClassName implementation;
4649
private final TypeName argumentType;
4750
private final List<TypeMirror> warnExceptions;
@@ -55,6 +58,10 @@ public ConvertEvaluatorImplementer(
5558
) {
5659
this.declarationType = (TypeElement) processFunction.getEnclosingElement();
5760
this.processFunction = new EvaluatorImplementer.ProcessFunction(types, processFunction, warnExceptions);
61+
this.canProcessOrdinals = warnExceptions.isEmpty()
62+
&& this.processFunction.returnType().equals(BYTES_REF)
63+
&& this.processFunction.args.getFirst() instanceof EvaluatorImplementer.StandardProcessFunctionArg s
64+
&& s.type().equals(BYTES_REF);
5865

5966
if (this.processFunction.args.getFirst() instanceof EvaluatorImplementer.StandardProcessFunctionArg == false) {
6067
throw new IllegalArgumentException("first argument must be the field to process");
@@ -66,7 +73,6 @@ public ConvertEvaluatorImplementer(
6673
}
6774
}
6875

69-
this.extraName = extraName;
7076
this.argumentType = TypeName.get(processFunction.getParameters().get(0).asType());
7177
this.warnExceptions = warnExceptions;
7278

@@ -102,6 +108,9 @@ private TypeSpec type() {
102108
builder.addMethod(evalValue(true));
103109
builder.addMethod(evalBlock());
104110
builder.addMethod(evalValue(false));
111+
if (canProcessOrdinals) {
112+
builder.addMethod(evalOrdinals());
113+
}
105114
builder.addMethod(processFunction.toStringMethod(implementation));
106115
builder.addMethod(processFunction.close());
107116
builder.addType(factory());
@@ -132,6 +141,15 @@ private MethodSpec evalVector() {
132141

133142
TypeName vectorType = vectorType(argumentType);
134143
builder.addStatement("$T vector = ($T) v", vectorType, vectorType);
144+
if (canProcessOrdinals) {
145+
builder.addStatement("$T ordinals = vector.asOrdinals()", ORDINALS_BYTES_REF_VECTOR);
146+
builder.beginControlFlow("if (ordinals != null)");
147+
{
148+
builder.addStatement("return evalOrdinals(ordinals)");
149+
}
150+
builder.endControlFlow();
151+
}
152+
135153
builder.addStatement("int positionCount = v.getPositionCount()");
136154

137155
String scratchPadName = argumentType.equals(BYTES_REF) ? "scratchPad" : null;
@@ -299,6 +317,31 @@ private MethodSpec evalValue(boolean forVector) {
299317
return builder.build();
300318
}
301319

320+
private MethodSpec evalOrdinals() {
321+
MethodSpec.Builder builder = MethodSpec.methodBuilder("evalOrdinals").addModifiers(Modifier.PRIVATE);
322+
builder.addParameter(ORDINALS_BYTES_REF_VECTOR, "v").returns(BLOCK);
323+
324+
builder.addStatement("int positionCount = v.getDictionaryVector().getPositionCount()");
325+
builder.addStatement("BytesRef scratchPad = new BytesRef()");
326+
builder.beginControlFlow(
327+
"try ($T builder = driverContext.blockFactory().newBytesRefVectorBuilder(positionCount))",
328+
BYTES_REF_VECTOR_BUILDER
329+
);
330+
{
331+
builder.beginControlFlow("for (int p = 0; p < positionCount; p++)");
332+
{
333+
builder.addStatement("builder.appendBytesRef($N)", evalValueCall("v.getDictionaryVector()", "p", "scratchPad"));
334+
}
335+
builder.endControlFlow();
336+
builder.addStatement("$T ordinals = v.getOrdinalsVector()", INT_VECTOR);
337+
builder.addStatement("ordinals.incRef()");
338+
builder.addStatement("return new $T(ordinals, builder.build()).asBlock()", ORDINALS_BYTES_REF_VECTOR);
339+
}
340+
builder.endControlFlow();
341+
342+
return builder.build();
343+
}
344+
302345
private TypeSpec factory() {
303346
TypeSpec.Builder builder = TypeSpec.classBuilder("Factory");
304347
builder.addSuperinterface(EXPRESSION_EVALUATOR_FACTORY);

x-pack/plugin/esql/compute/gen/src/main/java/org/elasticsearch/compute/gen/Types.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ public class Types {
6161

6262
static final ClassName BOOLEAN_VECTOR = ClassName.get(DATA_PACKAGE, "BooleanVector");
6363
static final ClassName BYTES_REF_VECTOR = ClassName.get(DATA_PACKAGE, "BytesRefVector");
64+
static final ClassName ORDINALS_BYTES_REF_VECTOR = ClassName.get(DATA_PACKAGE, "OrdinalBytesRefVector");
6465
static final ClassName INT_VECTOR = ClassName.get(DATA_PACKAGE, "IntVector");
6566
static final ClassName LONG_VECTOR = ClassName.get(DATA_PACKAGE, "LongVector");
6667
static final ClassName DOUBLE_VECTOR = ClassName.get(DATA_PACKAGE, "DoubleVector");

x-pack/plugin/esql/compute/test/src/main/java/org/elasticsearch/compute/test/BlockTestUtils.java

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,23 @@
1313
import org.elasticsearch.compute.data.BlockUtils;
1414
import org.elasticsearch.compute.data.BooleanBlock;
1515
import org.elasticsearch.compute.data.BytesRefBlock;
16+
import org.elasticsearch.compute.data.BytesRefVector;
1617
import org.elasticsearch.compute.data.DocBlock;
1718
import org.elasticsearch.compute.data.DoubleBlock;
1819
import org.elasticsearch.compute.data.ElementType;
1920
import org.elasticsearch.compute.data.FloatBlock;
2021
import org.elasticsearch.compute.data.IntBlock;
2122
import org.elasticsearch.compute.data.LongBlock;
23+
import org.elasticsearch.compute.data.OrdinalBytesRefBlock;
2224
import org.elasticsearch.compute.data.Page;
25+
import org.elasticsearch.core.Releasables;
2326
import org.hamcrest.Matcher;
2427

2528
import java.util.ArrayList;
29+
import java.util.Arrays;
30+
import java.util.HashMap;
2631
import java.util.List;
32+
import java.util.Map;
2733

2834
import static org.elasticsearch.compute.data.BlockUtils.toJavaObject;
2935
import static org.elasticsearch.test.ESTestCase.between;
@@ -267,4 +273,67 @@ public static List<List<Object>> valuesAtPositions(Block block, int from, int to
267273
}
268274
return result;
269275
}
276+
277+
/**
278+
* Convert all of the {@link Block}s in a page that contain {@link BytesRef}s into
279+
* {@link OrdinalBytesRefBlock}s.
280+
*/
281+
public static Page convertBytesRefsToOrdinals(Page page) {
282+
Block[] blocks = new Block[page.getBlockCount()];
283+
try {
284+
for (int b = 0; b < page.getBlockCount(); b++) {
285+
Block block = page.getBlock(b);
286+
if (block.elementType() != ElementType.BYTES_REF) {
287+
blocks[b] = block;
288+
continue;
289+
}
290+
Map<BytesRef, Integer> dedupe = new HashMap<>();
291+
BytesRefBlock bytesRefBlock = (BytesRefBlock) block;
292+
try (
293+
IntBlock.Builder ordinals = block.blockFactory().newIntBlockBuilder(block.getPositionCount());
294+
BytesRefVector.Builder bytes = block.blockFactory().newBytesRefVectorBuilder(block.getPositionCount())
295+
) {
296+
BytesRef scratch = new BytesRef();
297+
for (int p = 0; p < block.getPositionCount(); p++) {
298+
int first = block.getFirstValueIndex(p);
299+
int count = block.getValueCount(p);
300+
if (count == 0) {
301+
ordinals.appendNull();
302+
continue;
303+
}
304+
if (count == 1) {
305+
BytesRef v = bytesRefBlock.getBytesRef(first, scratch);
306+
ordinals.appendInt(dedupe(dedupe, bytes, v));
307+
continue;
308+
}
309+
int end = first + count;
310+
ordinals.beginPositionEntry();
311+
for (int i = first; i < end; i++) {
312+
BytesRef v = bytesRefBlock.getBytesRef(i, scratch);
313+
ordinals.appendInt(dedupe(dedupe, bytes, v));
314+
}
315+
ordinals.endPositionEntry();
316+
}
317+
blocks[b] = new OrdinalBytesRefBlock(ordinals.build(), bytes.build());
318+
bytesRefBlock.decRef();
319+
}
320+
}
321+
Page p = new Page(blocks);
322+
Arrays.fill(blocks, null);
323+
return p;
324+
} finally {
325+
Releasables.close(blocks);
326+
}
327+
}
328+
329+
private static int dedupe(Map<BytesRef, Integer> dedupe, BytesRefVector.Builder bytes, BytesRef v) {
330+
Integer current = dedupe.get(v);
331+
if (current != null) {
332+
return current;
333+
}
334+
bytes.appendBytesRef(v);
335+
int o = dedupe.size();
336+
dedupe.put(v, o);
337+
return o;
338+
}
270339
}

x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/convert/ToStringFromCartesianPointEvaluator.java

Lines changed: 19 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/convert/ToStringFromCartesianShapeEvaluator.java

Lines changed: 19 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)