Skip to content

Commit 0eb8b5f

Browse files
nik9000gmjehovich
authored andcommitted
ESQL: Always estimate size for types (elastic#134701)
Remove the "unknown" size estimates for ESQL types - we were defaulting to 50 bytes each and that's just... always more wrong and making a bad estimate for each type.
1 parent 81d6700 commit 0eb8b5f

File tree

6 files changed

+39
-30
lines changed

6 files changed

+39
-30
lines changed

test/external-modules/esql-heap-attack/src/javaRestTest/java/org/elasticsearch/xpack/esql/heap_attack/HeapAttackIT.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -819,15 +819,16 @@ public void testLookupExplosionNoFetchManyMatches() throws IOException {
819819
}
820820

821821
public void testLookupExplosionBigString() throws IOException {
822-
int sensorDataCount = 150;
822+
int sensorDataCount = 500;
823823
int lookupEntries = 1;
824824
Map<?, ?> map = lookupExplosionBigString(sensorDataCount, lookupEntries);
825825
assertMap(map, matchesMap().extraOk().entry("values", List.of(List.of(sensorDataCount * lookupEntries))));
826826
}
827827

828828
public void testLookupExplosionBigStringManyMatches() throws IOException {
829-
// 500, 1 is enough to make it fail locally but some CI needs more
830-
assertCircuitBreaks(attempt -> lookupExplosionBigString(attempt * 500, 1));
829+
// 500, 1 is enough with a single node, but the serverless copy of this test uses many nodes.
830+
// So something like 5000, 10 is much more of a sure thing there.
831+
assertCircuitBreaks(attempt -> lookupExplosionBigString(attempt * 5000, 10));
831832
}
832833

833834
private Map<String, Object> lookupExplosion(

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/type/DataType.java

Lines changed: 30 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
import java.util.List;
2828
import java.util.Locale;
2929
import java.util.Map;
30-
import java.util.Optional;
30+
import java.util.Objects;
3131
import java.util.Set;
3232
import java.util.function.Function;
3333

@@ -145,7 +145,7 @@ public enum DataType {
145145
* Fields of this type are unsupported by any functions and are always
146146
* rendered as {@code null} in the response.
147147
*/
148-
UNSUPPORTED(builder().typeName("UNSUPPORTED").unknownSize()),
148+
UNSUPPORTED(builder().typeName("UNSUPPORTED").estimatedSize(1024)),
149149
/**
150150
* Fields that are always {@code null}, usually created with constant
151151
* {@code null} values.
@@ -238,15 +238,15 @@ public enum DataType {
238238
* Generally ESQL uses {@code keyword} fields as raw strings. So things like
239239
* {@code TO_STRING} will make a {@code keyword} field.
240240
*/
241-
KEYWORD(builder().esType("keyword").unknownSize().docValues()),
241+
KEYWORD(builder().esType("keyword").estimatedSize(50).docValues()),
242242
/**
243243
* String fields that are analyzed when the document is received and may be
244244
* cut into more than one token. Generally ESQL only sees {@code text} fields
245245
* when loaded from the index and ESQL will load these fields
246246
* <strong>without</strong> analysis. The {@code MATCH} operator can be used
247247
* to query these fields with analysis.
248248
*/
249-
TEXT(builder().esType("text").unknownSize()),
249+
TEXT(builder().esType("text").estimatedSize(1024)),
250250
/**
251251
* Millisecond precision date, stored as a 64-bit signed number.
252252
*/
@@ -267,8 +267,8 @@ public enum DataType {
267267
*/
268268
// 8.15.2-SNAPSHOT is 15 bytes, most are shorter, some can be longer
269269
VERSION(builder().esType("version").estimatedSize(15).docValues()),
270-
OBJECT(builder().esType("object").unknownSize()),
271-
SOURCE(builder().esType(SourceFieldMapper.NAME).unknownSize()),
270+
OBJECT(builder().esType("object").estimatedSize(1024)),
271+
SOURCE(builder().esType(SourceFieldMapper.NAME).estimatedSize(10 * 1024)),
272272
DATE_PERIOD(builder().typeName("DATE_PERIOD").estimatedSize(3 * Integer.BYTES)),
273273
TIME_DURATION(builder().typeName("TIME_DURATION").estimatedSize(Integer.BYTES + Long.BYTES)),
274274
// WKB for points is typically 21 bytes.
@@ -298,20 +298,20 @@ public enum DataType {
298298
* Every document in {@link IndexMode#TIME_SERIES} index will have a single value
299299
* for this field and the segments themselves are sorted on this value.
300300
*/
301-
TSID_DATA_TYPE(builder().esType("_tsid").unknownSize().docValues()),
301+
TSID_DATA_TYPE(builder().esType("_tsid").estimatedSize(Long.BYTES * 2).docValues()),
302302
/**
303303
* Fields with this type are the partial result of running a non-time-series aggregation
304304
* inside alongside time-series aggregations. These fields are not parsable from the
305305
* mapping and should be hidden from users.
306306
*/
307-
PARTIAL_AGG(builder().esType("partial_agg").unknownSize()),
307+
PARTIAL_AGG(builder().esType("partial_agg").estimatedSize(1024)),
308308

309309
AGGREGATE_METRIC_DOUBLE(builder().esType("aggregate_metric_double").estimatedSize(Double.BYTES * 3 + Integer.BYTES)),
310310

311311
/**
312312
* Fields with this type are dense vectors, represented as an array of double values.
313313
*/
314-
DENSE_VECTOR(builder().esType("dense_vector").unknownSize());
314+
DENSE_VECTOR(builder().esType("dense_vector").estimatedSize(4096));
315315

316316
/**
317317
* Types that are actively being built. These types are
@@ -341,7 +341,7 @@ public enum DataType {
341341

342342
private final String esType;
343343

344-
private final Optional<Integer> estimatedSize;
344+
private final int estimatedSize;
345345

346346
/**
347347
* True if the type represents a "whole number", as in, does <strong>not</strong> have a decimal part.
@@ -377,11 +377,10 @@ public enum DataType {
377377

378378
DataType(Builder builder) {
379379
String typeString = builder.typeName != null ? builder.typeName : builder.esType;
380-
assert builder.estimatedSize != null : "Missing size for type " + typeString;
381380
this.typeName = typeString.toLowerCase(Locale.ROOT);
382381
this.name = typeString.toUpperCase(Locale.ROOT);
383382
this.esType = builder.esType;
384-
this.estimatedSize = builder.estimatedSize;
383+
this.estimatedSize = Objects.requireNonNull(builder.estimatedSize, "estimated size is required");
385384
this.isWholeNumber = builder.isWholeNumber;
386385
this.isRationalNumber = builder.isRationalNumber;
387386
this.docValues = builder.docValues;
@@ -683,10 +682,21 @@ public boolean isNumeric() {
683682
}
684683

685684
/**
686-
* @return the estimated size, in bytes, of this data type. If there's no reasonable way to estimate the size,
687-
* the optional will be empty.
685+
* An estimate of the size of values of this type in a Block. All types must have an
686+
* estimate, and generally follow the following rules:
687+
* <ol>
688+
* <li>
689+
* If you know the precise size of a single element of this type, use that.
690+
* For example {@link #INTEGER} uses {@link Integer#BYTES}.
691+
* </li>
692+
* <li>
693+
* Overestimates are better than under-estimates. Over-estimates make less
694+
* efficient operations, but under-estimates make circuit breaker errors.
695+
* </li>
696+
* </ol>
697+
* @return the estimated size of this data type in bytes
688698
*/
689-
public Optional<Integer> estimatedSize() {
699+
public int estimatedSize() {
690700
return estimatedSize;
691701
}
692702

@@ -801,7 +811,7 @@ private static class Builder {
801811

802812
private String typeName;
803813

804-
private Optional<Integer> estimatedSize;
814+
private Integer estimatedSize;
805815

806816
/**
807817
* True if the type represents a "whole number", as in, does <strong>not</strong> have a decimal part.
@@ -848,13 +858,11 @@ Builder typeName(String typeName) {
848858
return this;
849859
}
850860

861+
/**
862+
* See {@link DataType#estimatedSize}.
863+
*/
851864
Builder estimatedSize(int size) {
852-
this.estimatedSize = Optional.of(size);
853-
return this;
854-
}
855-
856-
Builder unknownSize() {
857-
this.estimatedSize = Optional.empty();
865+
this.estimatedSize = size;
858866
return this;
859867
}
860868

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/plan/physical/EstimatesRowSize.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,6 @@ static int estimateSize(DataType dataType) {
106106
if (elementType == ElementType.UNKNOWN) {
107107
throw new EsqlIllegalArgumentException("[unknown] can't be the result of field extraction");
108108
}
109-
return dataType.estimatedSize().orElse(50);
109+
return dataType.estimatedSize();
110110
}
111111
}

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/type/EsqlDataTypeConverter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -459,8 +459,8 @@ public static DataType commonType(DataType left, DataType right) {
459459
return KEYWORD;
460460
}
461461
if (left.isNumeric() && right.isNumeric()) {
462-
int lsize = left.estimatedSize().orElseThrow();
463-
int rsize = right.estimatedSize().orElseThrow();
462+
int lsize = left.estimatedSize();
463+
int rsize = right.estimatedSize();
464464
// if one is int
465465
if (left.isWholeNumber()) {
466466
// promote the highest int

x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/PhysicalPlanOptimizerTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8018,7 +8018,7 @@ public void testReductionPlanForLimit() {
80188018
Tuple<PhysicalPlan, PhysicalPlan> plans = PlannerUtils.breakPlanBetweenCoordinatorAndDataNode(plan, config);
80198019
PhysicalPlan reduction = PlannerUtils.reductionPlan(plans.v2());
80208020
LimitExec limitExec = as(reduction, LimitExec.class);
8021-
assertThat(limitExec.estimatedRowSize(), equalTo(328));
8021+
assertThat(limitExec.estimatedRowSize(), equalTo(2276));
80228022
}
80238023

80248024
public void testEqualsPushdownToDelegate() {

x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/type/EsqlDataTypeConverterTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ private static void commonNumericType(DataType numericType, List<DataType> lower
141141
List<DataType> NUMERICS = Arrays.stream(DataType.values()).filter(DataType::isNumeric).toList();
142142
List<DataType> DOUBLES = Arrays.stream(DataType.values()).filter(DataType::isRationalNumber).toList();
143143
for (DataType dataType : DataType.values()) {
144-
if (DOUBLES.containsAll(List.of(numericType, dataType)) && (dataType.estimatedSize().equals(numericType.estimatedSize()))) {
144+
if (DOUBLES.containsAll(List.of(numericType, dataType)) && (dataType.estimatedSize() == numericType.estimatedSize())) {
145145
assertEquals(numericType, commonType(dataType, numericType));
146146
} else if (lowerTypes.contains(dataType)) {
147147
assertEqualsCommonType(numericType, dataType, numericType);

0 commit comments

Comments
 (0)