Skip to content

Commit 27feccd

Browse files
not-napoleon and elasticsearchmachine
authored
Histogram field data type (elastic#139457)
Add minimal support for a histogram field data type. This adds the type as "under construction" but not behind a feature flag; as with the previous PR, I don't see any need for that additional layer. As is tradition with new data types, most of this PR is test support. --------- Co-authored-by: elasticsearchmachine <infra-root+elasticsearchmachine@elastic.co>
1 parent 06f6edf commit 27feccd

File tree

26 files changed

+331
-45
lines changed

26 files changed

+331
-45
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
9245000
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
jina_ai_embedding_dimensions_support_added,9244000
1+
esql_histogram_datatype,9245000

x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/RestEsqlIT.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,7 @@ public void testSuggestedCast() throws IOException {
792792
shouldBeSupported.remove(DataType.DENSE_VECTOR);
793793
shouldBeSupported.remove(DataType.EXPONENTIAL_HISTOGRAM); // TODO(b/133393): add support when blockloader is implemented
794794
shouldBeSupported.remove(DataType.TDIGEST);
795+
shouldBeSupported.remove(DataType.HISTOGRAM);
795796
if (EsqlCapabilities.Cap.AGGREGATE_METRIC_DOUBLE_V0.isEnabled() == false) {
796797
shouldBeSupported.remove(DataType.AGGREGATE_METRIC_DOUBLE);
797798
}

x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/AllSupportedFieldsTestCase.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,20 @@ protected static void createAllTypesDoc(RestClient client, String indexName) thr
830830
}
831831
case EXPONENTIAL_HISTOGRAM -> ExponentialHistogramXContent.serialize(doc, EXPONENTIAL_HISTOGRAM_VALUE);
832832
case DENSE_VECTOR -> doc.value(List.of(0.5, 10, 6));
833+
case HISTOGRAM -> {
834+
doc.startObject();
835+
doc.startArray("values");
836+
doc.value(0.1);
837+
doc.value(0.2);
838+
doc.value(0.3);
839+
doc.endArray();
840+
doc.startArray("counts");
841+
doc.value(3);
842+
doc.value(7);
843+
doc.value(23);
844+
doc.endArray();
845+
doc.endObject();
846+
}
833847
default -> throw new AssertionError("unsupported field type [" + type + "]");
834848
}
835849
}

x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/EsqlSpecTestCase.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ public void setup() {
187187
false,
188188
supportsExponentialHistograms(),
189189
supportsTDigestField(),
190+
supportsHistogramDataType(),
190191
supportsBFloat16ElementType()
191192
);
192193
return null;
@@ -319,6 +320,10 @@ protected boolean supportsTDigestField() {
319320
return RestEsqlTestCase.hasCapabilities(client(), List.of(EsqlCapabilities.Cap.TDIGEST_FIELD_TYPE_SUPPORT_V3.capabilityName()));
320321
}
321322

323+
protected boolean supportsHistogramDataType() {
324+
return RestEsqlTestCase.hasCapabilities(client(), List.of(EsqlCapabilities.Cap.HISTOGRAM_FIELD_SUPPORT_V0.capabilityName()));
325+
}
326+
322327
protected boolean supportsBFloat16ElementType() {
323328
return RestEsqlTestCase.hasCapabilities(client(), List.of(EsqlCapabilities.Cap.GENERIC_VECTOR_FORMAT.capabilityName()));
324329
}

x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/generative/GenerativeRestTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ private static List<String> originalTypes(Map<String, ?> x) {
265265
}
266266

267267
private List<String> availableIndices() throws IOException {
268-
return availableDatasetsForEs(true, supportsSourceFieldMapping(), false, requiresTimeSeries(), false, false, false).stream()
268+
return availableDatasetsForEs(true, supportsSourceFieldMapping(), false, requiresTimeSeries(), false, false, false, false).stream()
269269
.filter(x -> x.requiresInferenceEndpoint() == false)
270270
.map(x -> x.indexName())
271271
.toList();

x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
import static org.elasticsearch.xpack.esql.core.util.SpatialCoordinateTypes.GEO;
4949
import static org.elasticsearch.xpack.esql.type.EsqlDataTypeConverter.aggregateMetricDoubleLiteralToString;
5050
import static org.elasticsearch.xpack.esql.type.EsqlDataTypeConverter.exponentialHistogramToString;
51+
import static org.elasticsearch.xpack.esql.type.EsqlDataTypeConverter.histogramToString;
5152
import static org.hamcrest.MatcherAssert.assertThat;
5253
import static org.hamcrest.Matchers.instanceOf;
5354
import static org.junit.Assert.assertEquals;
@@ -69,7 +70,7 @@ public static void assertMetadata(ExpectedResults expected, List<Map<String, Str
6970
var actualColumnNames = new ArrayList<String>(actualColumns.size());
7071
var actualColumnTypes = actualColumns.stream()
7172
.peek(c -> actualColumnNames.add(c.get("name")))
72-
.map(c -> CsvTestUtils.Type.asType(c.get("type")))
73+
.map(c -> Type.asType(c.get("type")))
7374
.toList();
7475
assertMetadata(expected, actualColumnNames, actualColumnTypes, List.of(), logger);
7576
}
@@ -226,16 +227,16 @@ public static void assertData(
226227
logger.info(row(actualValues, row));
227228
}
228229

229-
var expectedRow = expectedValues.get(row);
230-
var actualRow = actualValues.get(row);
230+
List<Object> expectedRow = expectedValues.get(row);
231+
List<Object> actualRow = actualValues.get(row);
231232

232233
for (int column = 0; column < expectedRow.size(); column++) {
233-
var expectedType = expected.columnTypes().get(column);
234-
var expectedValue = convertExpectedValue(expectedType, expectedRow.get(column));
235-
var actualValue = actualRow.get(column);
234+
Type expectedType = expected.columnTypes().get(column);
235+
Object expectedValue = convertExpectedValue(expectedType, expectedRow.get(column));
236+
Object actualValue = actualRow.get(column);
236237

237-
var transformedExpected = valueTransformer.apply(expectedType, expectedValue);
238-
var transformedActual = valueTransformer.apply(expectedType, actualValue);
238+
Object transformedExpected = valueTransformer.apply(expectedType, expectedValue);
239+
Object transformedActual = valueTransformer.apply(expectedType, actualValue);
239240
if (equals(transformedExpected, transformedActual) == false) {
240241
dataFailures.add(new DataFailure(row, column, transformedExpected, transformedActual));
241242
}
@@ -456,7 +457,9 @@ private static Object convertExpectedValue(Type expectedType, Object expectedVal
456457
ExponentialHistogram.class,
457458
x -> exponentialHistogramToString((ExponentialHistogram) x)
458459
);
459-
default -> expectedValue;
460+
case HISTOGRAM -> rebuildExpected(expectedValue, BytesRef.class, x -> histogramToString((BytesRef) x));
461+
case INTEGER, LONG, DOUBLE, FLOAT, HALF_FLOAT, SCALED_FLOAT, KEYWORD, TEXT, SEMANTIC_TEXT, IP_RANGE, INTEGER_RANGE,
462+
DOUBLE_RANGE, DATE_RANGE, NULL, BOOLEAN, DENSE_VECTOR, TDIGEST, UNSUPPORTED -> expectedValue;
460463
};
461464
}
462465

x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestUtils.java

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import org.apache.lucene.util.BytesRef;
1212
import org.elasticsearch.Version;
1313
import org.elasticsearch.common.breaker.NoopCircuitBreaker;
14+
import org.elasticsearch.common.io.stream.BytesStreamOutput;
1415
import org.elasticsearch.common.network.InetAddresses;
1516
import org.elasticsearch.common.time.DateFormatters;
1617
import org.elasticsearch.common.time.DateUtils;
@@ -38,6 +39,7 @@
3839
import org.elasticsearch.logging.Logger;
3940
import org.elasticsearch.search.aggregations.bucket.geogrid.GeoTileUtils;
4041
import org.elasticsearch.tdigest.parsing.TDigestParser;
42+
import org.elasticsearch.test.ESTestCase;
4143
import org.elasticsearch.test.VersionUtils;
4244
import org.elasticsearch.xcontent.XContentParser;
4345
import org.elasticsearch.xcontent.XContentParserConfiguration;
@@ -68,6 +70,7 @@
6870
import java.util.stream.Stream;
6971

7072
import static org.elasticsearch.common.logging.LoggerMessageFormat.format;
73+
import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken;
7174
import static org.elasticsearch.xpack.esql.EsqlTestUtils.reader;
7275
import static org.elasticsearch.xpack.esql.SpecReader.shouldSkipLine;
7376
import static org.elasticsearch.xpack.esql.core.type.DataTypeConverter.safeToUnsignedLong;
@@ -517,6 +520,7 @@ public enum Type {
517520
DENSE_VECTOR(Float::parseFloat, Float.class, false),
518521
EXPONENTIAL_HISTOGRAM(CsvTestUtils::parseExponentialHistogram, ExponentialHistogram.class),
519522
TDIGEST(CsvTestUtils::parseTDigest, TDigestHolder.class),
523+
HISTOGRAM(CsvTestUtils::parseHistogram, BytesRef.class),
520524
UNSUPPORTED(Type::convertUnsupported, Void.class);
521525

522526
private static Void convertUnsupported(String s) {
@@ -622,6 +626,7 @@ private static Type bytesRefBlockType(Type actualType) {
622626
return switch (actualType) {
623627
case NULL -> NULL;
624628
case GEO_POINT, CARTESIAN_POINT, GEO_SHAPE, CARTESIAN_SHAPE -> actualType;
629+
case HISTOGRAM -> HISTOGRAM;
625630
default -> KEYWORD;
626631
};
627632
}
@@ -760,4 +765,88 @@ private static TDigestHolder parseTDigest(@Nullable String json) {
760765
throw new IllegalArgumentException(e);
761766
}
762767
}
768+
769+
public static BytesRef parseHistogram(@Nullable String json) {
770+
if (json == null) {
771+
return null;
772+
}
773+
try (XContentParser parser = JsonXContent.jsonXContent.createParser(XContentParserConfiguration.EMPTY, json)) {
774+
if (parser.nextToken() != XContentParser.Token.START_OBJECT) {
775+
throw new IllegalArgumentException("Expected START_OBJECT but found: " + parser.currentToken());
776+
}
777+
parser.nextToken();
778+
// TODO: This is striaght up copied from HistogramParser. There are even fewer good places to put that for resue than
779+
// for TDigest, but maybe we can do some sensible refactoring down the road
780+
ArrayList<Double> values = null;
781+
ArrayList<Long> counts = null;
782+
XContentParser.Token token = parser.currentToken();
783+
while (token != XContentParser.Token.END_OBJECT) {
784+
// should be a field
785+
ensureExpectedToken(XContentParser.Token.FIELD_NAME, token, parser);
786+
String fieldName = parser.currentName();
787+
if (fieldName.equals("values")) {
788+
token = parser.nextToken();
789+
// should be an array
790+
ensureExpectedToken(XContentParser.Token.START_ARRAY, token, parser);
791+
values = new ArrayList<>();
792+
token = parser.nextToken();
793+
double previousVal = -Double.MAX_VALUE;
794+
while (token != XContentParser.Token.END_ARRAY) {
795+
// should be a number
796+
ensureExpectedToken(XContentParser.Token.VALUE_NUMBER, token, parser);
797+
double val = parser.doubleValue();
798+
if (val < previousVal) {
799+
// values must be in increasing order
800+
ESTestCase.fail("Error parsing CSV histogram data, values out of order");
801+
}
802+
values.add(val);
803+
previousVal = val;
804+
token = parser.nextToken();
805+
}
806+
} else if (fieldName.equals("counts")) {
807+
token = parser.nextToken();
808+
// should be an array
809+
ensureExpectedToken(XContentParser.Token.START_ARRAY, token, parser);
810+
counts = new ArrayList<>();
811+
token = parser.nextToken();
812+
while (token != XContentParser.Token.END_ARRAY) {
813+
// should be a number
814+
ensureExpectedToken(XContentParser.Token.VALUE_NUMBER, token, parser);
815+
long count = parser.longValue();
816+
if (count < 0) {
817+
ESTestCase.fail("Error parsing CSV histogram data, negative count");
818+
}
819+
counts.add(count);
820+
token = parser.nextToken();
821+
}
822+
} else {
823+
ESTestCase.fail("Error parsing CSV histogram data, unknown field: " + fieldName);
824+
}
825+
token = parser.nextToken();
826+
}
827+
if (values == null) {
828+
ESTestCase.fail("Error parsing CSV histogram data, no values field");
829+
}
830+
if (counts == null) {
831+
ESTestCase.fail("Error parsing CSV histogram data, no counts field");
832+
}
833+
if (values.size() != counts.size()) {
834+
ESTestCase.fail("expected counts and values to be same length but got [" + values.size() + " != " + counts.size() + "]");
835+
}
836+
BytesStreamOutput streamOutput = new BytesStreamOutput();
837+
for (int i = 0; i < values.size(); i++) {
838+
long count = counts.get(i);
839+
assert count >= 0;
840+
// we do not add elements with count == 0
841+
if (count > 0) {
842+
streamOutput.writeVLong(count);
843+
streamOutput.writeLong(Double.doubleToRawLongBits(values.get(i)));
844+
}
845+
}
846+
BytesRef docValue = streamOutput.bytes().toBytesRef();
847+
return docValue;
848+
} catch (IOException e) {
849+
throw new IllegalArgumentException(e);
850+
}
851+
}
763852
}

0 commit comments

Comments
 (0)