Skip to content
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/138548.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 138548
summary: Store high-cardinality keyword fields in binary doc values
area: Mapping
type: feature
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.elasticsearch.index.mapper.TestDocumentParserContext;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.script.field.BinaryDocValuesField;
import org.elasticsearch.search.SearchModule;
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
import org.elasticsearch.test.ESTestCase;
Expand Down Expand Up @@ -88,7 +89,7 @@ public void testStoringQueryBuilders() throws IOException {
when(searchExecutionContext.getWriteableRegistry()).thenReturn(writableRegistry());
when(searchExecutionContext.getParserConfig()).thenReturn(parserConfig());
when(searchExecutionContext.getForField(fieldMapper.fieldType(), fielddataOperation)).thenReturn(
new BytesBinaryIndexFieldData(fieldMapper.fullPath(), CoreValuesSourceType.KEYWORD)
new BytesBinaryIndexFieldData(fieldMapper.fullPath(), CoreValuesSourceType.KEYWORD, BinaryDocValuesField::new)
);
when(searchExecutionContext.getFieldType(Mockito.anyString())).thenAnswer(invocation -> {
final String fieldName = (String) invocation.getArguments()[0];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ setup:
keyword:
type: keyword
index: false
keyword_high_cardinality:
type: keyword
index: false
doc_values:
cardinality: high
boolean:
type: boolean
index: false
Expand All @@ -63,6 +68,7 @@ setup:
short: 1
date: "2017/01/01"
keyword: "key1"
keyword_high_cardinality: "key1"
boolean: "false"
ip: "192.168.0.1"
geo_point: [13.5, 34.89]
Expand All @@ -81,6 +87,7 @@ setup:
short: 2
date: "2017/01/02"
keyword: "key2"
keyword_high_cardinality: "key2"
boolean: "true"
ip: "192.168.0.2"
geo_point : [-63.24, 31.0]
Expand Down Expand Up @@ -271,6 +278,12 @@ setup:
body: { query: { match: { keyword: { query: "key1" } } } }
- length: { hits.hits: 1 }

- do:
search:
index: test
body: { query: { match: { keyword_high_cardinality: { "query": "key1" } } } }
- length: { hits.hits: 1 }

---
"Test terms query on keyword field where only doc values are enabled":

Expand All @@ -280,6 +293,11 @@ setup:
body: { query: { terms: { keyword: [ "key1", "key2" ] } } }
- length: { hits.hits: 2 }

# Both indexed documents ("key1" and "key2") must match, mirroring the assertion on the plain keyword field.
- do:
search:
index: test
body: { query: { terms: { keyword_high_cardinality: [ "key1", "key2" ] } } }
- length: { hits.hits: 2 }

---
"Test range query on keyword field where only doc values are enabled":

Expand All @@ -289,6 +307,12 @@ setup:
body: { query: { range: { keyword: { gte: "key1" } } } }
- length: { hits.hits: 2 }

- do:
search:
index: test
body: { query: { range: { keyword_high_cardinality: { gte: "key1" } } } }
- length: { hits.hits: 2 }

---
"Test fuzzy query on keyword field where only doc values are enabled":

Expand All @@ -298,6 +322,12 @@ setup:
body: { query: { fuzzy: { keyword: { value: "kay1", fuzziness: 1 } } } }
- length: { hits.hits: 1 }

- do:
search:
index: test
body: { query: { fuzzy: { keyword_high_cardinality: { value: "kay1", fuzziness: 1 } } } }
- length: { hits.hits: 1 }

---
"Test prefix query on keyword field where only doc values are enabled":

Expand All @@ -307,6 +337,12 @@ setup:
body: { query: { prefix: { keyword: { value: "key" } } } }
- length: { hits.hits: 2 }

- do:
search:
index: test
body: { query: { prefix: { keyword_high_cardinality: { value: "key" } } } }
- length: { hits.hits: 2 }

---
"Test case insensitive term query on keyword field where only doc values are enabled":

Expand All @@ -316,6 +352,12 @@ setup:
body: { query: { term: { keyword: { value: "KeY1", case_insensitive: true } } } }
- length: { hits.hits: 1 }

- do:
search:
index: test
body: { query: { term: { keyword_high_cardinality: { value: "KeY1", case_insensitive: true } } } }
- length: { hits.hits: 1 }

---
"Test wildcard query on keyword field where only doc values are enabled":

Expand All @@ -325,6 +367,12 @@ setup:
body: { query: { wildcard: { keyword: { value: "k*1" } } } }
- length: { hits.hits: 1 }

- do:
search:
index: test
body: { query: { wildcard: { keyword_high_cardinality: { value: "k*1" } } } }
- length: { hits.hits: 1 }

---
"Test case insensitive wildcard query on keyword field where only doc values are enabled":

Expand All @@ -334,6 +382,12 @@ setup:
body: { query: { wildcard: { keyword: { value: "K*1", case_insensitive: true } } } }
- length: { hits.hits: 1 }

- do:
search:
index: test
body: { query: { wildcard: { keyword_high_cardinality: { value: "K*1", case_insensitive: true } } } }
- length: { hits.hits: 1 }

---
"Test regexp query on keyword field where only doc values are enabled":

Expand All @@ -343,6 +397,12 @@ setup:
body: { query: { regexp: { keyword: { value: "k.*1" } } } }
- length: { hits.hits: 1 }

- do:
search:
index: test
body: { query: { regexp: { keyword_high_cardinality: { value: "k.*1" } } } }
- length: { hits.hits: 1 }

---
"Test match query on boolean field where only doc values are enabled":

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.index.fielddata;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;

import java.io.IOException;

/**
 * A {@link SortedBinaryDocValues} view over a {@link BinaryDocValues} field in which each document's
 * single binary value packs several values together.
 *
 * <p>The layout decoded here is: a vInt holding the number of values, followed, for each value, by a
 * vInt length and then that many bytes.
 *
 * <p>Values are handed out in the order they appear in the encoded bytes; this class does not sort
 * them itself. NOTE(review): presumably the writer stores them already sorted — confirm against the
 * code that produces the binary value.
 *
 * <p>The {@link BytesRef} returned by {@link #nextValue()} is a single reused instance that points
 * into the array returned by the underlying {@link BinaryDocValues#binaryValue()}; callers that need
 * to retain a value must copy it before the next call.
 */
public class MultiValuedSortedBinaryDocValues extends SortedBinaryDocValues {

    /** Source of the packed per-document bytes. */
    private final BinaryDocValues values;
    /** Cursor over the packed bytes of the current document. */
    private final ByteArrayStreamInput in = new ByteArrayStreamInput();
    /** Reused holder returned from {@link #nextValue()}; never copied. */
    private final BytesRef scratch = new BytesRef();
    /** Number of values decoded from the header of the current document. */
    private int count;

    /**
     * @param values the binary doc values whose per-document bytes use the packed layout described
     *               on this class
     */
    public MultiValuedSortedBinaryDocValues(BinaryDocValues values) {
        this.values = values;
    }

    /**
     * Positions this instance on {@code doc} and reads the value count from the start of its bytes.
     *
     * @param doc the document to advance to
     * @return {@code true} if the underlying doc values have a value for {@code doc}; {@code false}
     *         otherwise, in which case the previously decoded state is left untouched
     * @throws IOException if the underlying doc values or the count header cannot be read
     */
    @Override
    public boolean advanceExact(int doc) throws IOException {
        if (values.advanceExact(doc) == false) {
            return false;
        }
        final BytesRef bytes = values.binaryValue();
        // A present value always carries at least the count header.
        assert bytes.length > 0;
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        count = in.readVInt();
        // Share the backing array so nextValue() only has to move offset and length.
        scratch.bytes = bytes.bytes;
        return true;
    }

    /**
     * @return the value count read by the most recent successful {@link #advanceExact(int)}
     */
    @Override
    public int docValueCount() {
        return count;
    }

    /**
     * Decodes the next length-prefixed value of the current document.
     *
     * @return the reused {@link BytesRef}, repositioned onto the next value's bytes
     * @throws IOException if the length prefix cannot be read
     */
    @Override
    public BytesRef nextValue() throws IOException {
        scratch.length = in.readVInt();
        scratch.offset = in.getPosition();
        // Skip over the payload so the following call starts at the next length prefix.
        in.setPosition(scratch.offset + scratch.length);
        return scratch;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,10 @@
package org.elasticsearch.index.fielddata.plain;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
import org.elasticsearch.index.fielddata.LeafFieldData;
import org.elasticsearch.index.fielddata.MultiValuedSortedBinaryDocValues;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;

import java.io.IOException;

abstract class AbstractBinaryDVLeafFieldData implements LeafFieldData {
private final BinaryDocValues values;

Expand All @@ -32,40 +29,7 @@ public long ramBytesUsed() {

@Override
public SortedBinaryDocValues getBytesValues() {
return new SortedBinaryDocValues() {

int count;
final ByteArrayStreamInput in = new ByteArrayStreamInput();
final BytesRef scratch = new BytesRef();

@Override
public boolean advanceExact(int doc) throws IOException {
if (values.advanceExact(doc)) {
final BytesRef bytes = values.binaryValue();
assert bytes.length > 0;
in.reset(bytes.bytes, bytes.offset, bytes.length);
count = in.readVInt();
scratch.bytes = bytes.bytes;
return true;
} else {
return false;
}
}

@Override
public int docValueCount() {
return count;
}

@Override
public BytesRef nextValue() throws IOException {
scratch.length = in.readVInt();
scratch.offset = in.getPosition();
in.setPosition(scratch.offset + scratch.length);
return scratch;
}

};
return new MultiValuedSortedBinaryDocValues(values);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,20 @@
package org.elasticsearch.index.fielddata.plain;

import org.apache.lucene.index.BinaryDocValues;
import org.elasticsearch.script.field.BinaryDocValuesField;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
import org.elasticsearch.script.field.DocValuesScriptFieldFactory;
import org.elasticsearch.script.field.ToScriptFieldFactory;

final class BytesBinaryDVLeafFieldData extends AbstractBinaryDVLeafFieldData {
BytesBinaryDVLeafFieldData(BinaryDocValues values) {
private final ToScriptFieldFactory<SortedBinaryDocValues> toScriptFieldFactory;

BytesBinaryDVLeafFieldData(BinaryDocValues values, ToScriptFieldFactory<SortedBinaryDocValues> toScriptFieldFactory) {
super(values);
this.toScriptFieldFactory = toScriptFieldFactory;
}

@Override
public DocValuesScriptFieldFactory getScriptFieldFactory(String name) {
return new BinaryDocValuesField(getBytesValues(), name);
return toScriptFieldFactory.getScriptFieldFactory(getBytesValues(), name);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.script.field.ToScriptFieldFactory;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.MultiValueMode;
import org.elasticsearch.search.aggregations.support.ValuesSourceType;
Expand All @@ -30,10 +32,16 @@ public class BytesBinaryIndexFieldData implements IndexFieldData<BytesBinaryDVLe

protected final String fieldName;
protected final ValuesSourceType valuesSourceType;
protected final ToScriptFieldFactory<SortedBinaryDocValues> toScriptFieldFactory;

public BytesBinaryIndexFieldData(String fieldName, ValuesSourceType valuesSourceType) {
public BytesBinaryIndexFieldData(
String fieldName,
ValuesSourceType valuesSourceType,
ToScriptFieldFactory<SortedBinaryDocValues> toScriptFieldFactory
) {
this.fieldName = fieldName;
this.valuesSourceType = valuesSourceType;
this.toScriptFieldFactory = toScriptFieldFactory;
}

@Override
Expand Down Expand Up @@ -68,7 +76,7 @@ public BucketedSort newBucketedSort(
@Override
public BytesBinaryDVLeafFieldData load(LeafReaderContext context) {
try {
return new BytesBinaryDVLeafFieldData(DocValues.getBinary(context.reader(), fieldName));
return new BytesBinaryDVLeafFieldData(DocValues.getBinary(context.reader(), fieldName), toScriptFieldFactory);
} catch (IOException e) {
throw new IllegalStateException("Cannot load doc values", e);
}
Expand All @@ -81,17 +89,19 @@ public BytesBinaryDVLeafFieldData loadDirect(LeafReaderContext context) {

public static class Builder implements IndexFieldData.Builder {
private final String name;
private final ToScriptFieldFactory<SortedBinaryDocValues> toScriptFieldFactory;
private final ValuesSourceType valuesSourceType;

public Builder(String name, ValuesSourceType valuesSourceType) {
public Builder(String name, ValuesSourceType valuesSourceType, ToScriptFieldFactory<SortedBinaryDocValues> toScriptFieldFactory) {
this.name = name;
this.valuesSourceType = valuesSourceType;
this.toScriptFieldFactory = toScriptFieldFactory;
}

@Override
public IndexFieldData<?> build(IndexFieldDataCache cache, CircuitBreakerService breakerService) {
// Ignore breaker
return new BytesBinaryIndexFieldData(name, valuesSourceType);
return new BytesBinaryIndexFieldData(name, valuesSourceType, toScriptFieldFactory);
}
}
}
Loading