Skip to content

Commit ede15bb

Browse files
Kubik42elasticsearchmachine
andauthored
Don't store keyword multi fields when they trip ignore_above (#132962)
* Merged with main * Renamed Builder * Delete docs/changelog/134582.yaml * Cleaned up * Fixes * Removed redundant tests * Update docs/changelog/132962.yaml * Addressed feedback - renamed some functions, converted all snake case test names to camel case * Reverted the removal of fallback synthetic source in block loader * [CI] Auto commit changes from spotless * Fixed failing tests --------- Co-authored-by: elasticsearchmachine <[email protected]>
1 parent caff54f commit ede15bb

File tree

46 files changed

+1230
-454
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+1230
-454
lines changed

docs/changelog/132962.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 132962
2+
summary: Don't store keyword multi fields when they trip `ignore_above`
3+
area: Mapping
4+
type: bug
5+
issues: []

modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java

Lines changed: 185 additions & 122 deletions
Large diffs are not rendered by default.

modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldTypeTests.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ public void testRangeIntervals() {
225225
);
226226
}
227227

228-
public void test_block_loader_uses_stored_fields_for_loading_when_synthetic_source_delegate_is_absent() {
228+
public void testBlockLoaderUsesStoredFieldsForLoadingWhenSyntheticSourceDelegateIsAbsent() {
229229
// given
230230
MatchOnlyTextFieldMapper.MatchOnlyTextFieldType ft = new MatchOnlyTextFieldMapper.MatchOnlyTextFieldType(
231231
"parent",
@@ -246,7 +246,7 @@ public void test_block_loader_uses_stored_fields_for_loading_when_synthetic_sour
246246
assertThat(blockLoader, Matchers.instanceOf(MatchOnlyTextFieldType.BytesFromMixedStringsBytesRefBlockLoader.class));
247247
}
248248

249-
public void test_block_loader_uses_synthetic_source_delegate_when_ignore_above_is_not_set() {
249+
public void testBlockLoaderUsesSyntheticSourceDelegateWhenIgnoreAboveIsNotSet() {
250250
// given
251251
KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate = new KeywordFieldMapper.KeywordFieldType(
252252
"child",
@@ -274,7 +274,7 @@ public void test_block_loader_uses_synthetic_source_delegate_when_ignore_above_i
274274
assertThat(blockLoader, Matchers.instanceOf(BlockLoader.Delegating.class));
275275
}
276276

277-
public void test_block_loader_does_not_use_synthetic_source_delegate_when_ignore_above_is_set() {
277+
public void testBlockLoaderDoesNotUseSyntheticSourceDelegateWhenIgnoreAboveIsSet() {
278278
// given
279279
Settings settings = Settings.builder()
280280
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
@@ -322,7 +322,7 @@ public void test_block_loader_does_not_use_synthetic_source_delegate_when_ignore
322322
assertThat(blockLoader, Matchers.not(Matchers.instanceOf(BlockLoader.Delegating.class)));
323323
}
324324

325-
public void test_block_loader_does_not_use_synthetic_source_delegate_when_ignore_above_is_set_at_index_level() {
325+
public void testBlockLoaderDoesNotUseSyntheticSourceDelegateWhenIgnoreAboveIsSetAtIndexLevel() {
326326
// given
327327
Settings settings = Settings.builder()
328328
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())

modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -650,6 +650,114 @@ synthetic_source match_only_text with ignored multi-field:
650650
- match:
651651
hits.hits.0._source.foo: "Apache Lucene powers Elasticsearch"
652652

653+
---
654+
synthetic_source match_only_text with ignored multi-field and multiple values in the same doc:
655+
- requires:
656+
cluster_features: [ "mapper.source.mode_from_index_setting" ]
657+
reason: "Source mode configured through index setting"
658+
659+
- do:
660+
indices.create:
661+
index: synthetic_source_test
662+
body:
663+
settings:
664+
index:
665+
mapping.source.mode: synthetic
666+
mappings:
667+
properties:
668+
foo:
669+
type: match_only_text
670+
fields:
671+
raw:
672+
type: keyword
673+
ignore_above: 20
674+
675+
- do:
676+
index:
677+
index: synthetic_source_test
678+
id: "1"
679+
refresh: true
680+
body:
681+
foo: [ "This value is too long and will be ignored", "This value is short" ]
682+
683+
- do:
684+
search:
685+
index: synthetic_source_test
686+
body:
687+
query:
688+
match_phrase:
689+
foo: this value is
690+
691+
- match: { "hits.total.value": 1 }
692+
- match: { hits.hits.0._source.foo.0: "This value is too long and will be ignored" }
693+
- match: { hits.hits.0._source.foo.1: "This value is short" }
694+
695+
# now, flip the values around
696+
- do:
697+
index:
698+
index: synthetic_source_test
699+
id: "1"
700+
refresh: true
701+
body:
702+
foo: [ "This value is short", "This value is too long and will be ignored" ]
703+
704+
- do:
705+
search:
706+
index: synthetic_source_test
707+
body:
708+
query:
709+
match_phrase:
710+
foo: this value is
711+
712+
- match: { "hits.total.value": 1 }
713+
# the order will be the same since text fields currently don't take offsets into account
714+
- match: { hits.hits.0._source.foo.0: "This value is too long and will be ignored" }
715+
- match: { hits.hits.0._source.foo.1: "This value is short" }
716+
717+
---
718+
synthetic_source match_only_text with ignored multi-field and multiple values in the same doc and preserved order:
719+
- requires:
720+
cluster_features: [ "mapper.source.mode_from_index_setting" ]
721+
reason: "Source mode configured through index setting"
722+
723+
- do:
724+
indices.create:
725+
index: synthetic_source_test
726+
body:
727+
settings:
728+
index:
729+
mapping.source.mode: synthetic
730+
mappings:
731+
properties:
732+
foo:
733+
type: match_only_text
734+
# this will force the order to be preserved
735+
synthetic_source_keep: arrays
736+
fields:
737+
raw:
738+
type: keyword
739+
ignore_above: 20
740+
741+
- do:
742+
index:
743+
index: synthetic_source_test
744+
id: "1"
745+
refresh: true
746+
body:
747+
foo: [ "This value is short", "This value is too long and will be ignored" ]
748+
749+
- do:
750+
search:
751+
index: synthetic_source_test
752+
body:
753+
query:
754+
match_phrase:
755+
foo: this value is
756+
757+
- match: { "hits.total.value": 1 }
758+
- match: { hits.hits.0._source.foo.0: "This value is short" }
759+
- match: { hits.hits.0._source.foo.1: "This value is too long and will be ignored" }
760+
653761
---
654762
synthetic_source match_only_text with stored multi-field:
655763
- requires:

plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java

Lines changed: 88 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,20 @@
2020
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
2121
import org.apache.lucene.document.Field;
2222
import org.apache.lucene.document.FieldType;
23+
import org.apache.lucene.document.StoredField;
2324
import org.apache.lucene.index.IndexOptions;
2425
import org.elasticsearch.ElasticsearchParseException;
2526
import org.elasticsearch.index.IndexVersion;
2627
import org.elasticsearch.index.analysis.AnalyzerScope;
2728
import org.elasticsearch.index.analysis.IndexAnalyzers;
2829
import org.elasticsearch.index.analysis.NamedAnalyzer;
30+
import org.elasticsearch.index.mapper.CompositeSyntheticFieldLoader;
2931
import org.elasticsearch.index.mapper.DocumentParserContext;
3032
import org.elasticsearch.index.mapper.FieldMapper;
3133
import org.elasticsearch.index.mapper.KeywordFieldMapper;
3234
import org.elasticsearch.index.mapper.MapperBuilderContext;
3335
import org.elasticsearch.index.mapper.SourceFieldMapper;
36+
import org.elasticsearch.index.mapper.SourceLoader;
3437
import org.elasticsearch.index.mapper.StringStoredFieldFieldLoader;
3538
import org.elasticsearch.index.mapper.TextFieldMapper;
3639
import org.elasticsearch.index.mapper.TextParams;
@@ -78,7 +81,7 @@ private static NamedAnalyzer wrapAnalyzer(NamedAnalyzer in) {
7881
);
7982
}
8083

81-
public static class Builder extends FieldMapper.Builder {
84+
public static class Builder extends BuilderWithSyntheticSourceContext {
8285

8386
final Parameter<SimilarityProvider> similarity = TextParams.similarity(m -> builder(m).similarity.getValue());
8487
final Parameter<String> indexOptions = TextParams.textIndexOptions(m -> builder(m).indexOptions.getValue());
@@ -87,25 +90,29 @@ public static class Builder extends FieldMapper.Builder {
8790

8891
private final Parameter<Map<String, String>> meta = Parameter.metaParam();
8992

90-
private final IndexVersion indexCreatedVersion;
9193
private final TextParams.Analyzers analyzers;
92-
private final boolean isSyntheticSourceEnabled;
9394
private final Parameter<Boolean> store;
9495

95-
public Builder(String name, IndexVersion indexCreatedVersion, IndexAnalyzers indexAnalyzers, boolean isSyntheticSourceEnabled) {
96-
super(name);
97-
this.indexCreatedVersion = indexCreatedVersion;
96+
public Builder(
97+
String name,
98+
IndexVersion indexCreatedVersion,
99+
IndexAnalyzers indexAnalyzers,
100+
boolean isSyntheticSourceEnabled,
101+
boolean isWithinMultiField
102+
) {
103+
super(name, indexCreatedVersion, isSyntheticSourceEnabled, isWithinMultiField);
98104
this.analyzers = new TextParams.Analyzers(
99105
indexAnalyzers,
100106
m -> builder(m).analyzers.getIndexAnalyzer(),
101107
m -> builder(m).analyzers.positionIncrementGap.getValue(),
102108
indexCreatedVersion
103109
);
104-
this.isSyntheticSourceEnabled = isSyntheticSourceEnabled;
105-
this.store = Parameter.storeParam(
106-
m -> builder(m).store.getValue(),
107-
() -> isSyntheticSourceEnabled && multiFieldsBuilder.hasSyntheticSourceCompatibleKeywordField() == false
108-
);
110+
this.store = Parameter.storeParam(m -> builder(m).store.getValue(), () -> {
111+
if (TextFieldMapper.keywordMultiFieldsNotStoredWhenIgnoredIndexVersionCheck(indexCreatedVersion())) {
112+
return false;
113+
}
114+
return isSyntheticSourceEnabled() && multiFieldsBuilder.hasSyntheticSourceCompatibleKeywordField() == false;
115+
});
109116
}
110117

111118
@Override
@@ -135,6 +142,7 @@ private AnnotatedTextFieldType buildFieldType(FieldType fieldType, MapperBuilder
135142
store.getValue(),
136143
tsi,
137144
context.isSourceSynthetic(),
145+
isWithinMultiField(),
138146
TextFieldMapper.SyntheticSourceHelper.syntheticSourceDelegate(fieldType.stored(), multiFields),
139147
meta.getValue()
140148
);
@@ -165,7 +173,13 @@ public AnnotatedTextFieldMapper build(MapperBuilderContext context) {
165173
}
166174

167175
public static final TypeParser PARSER = new TypeParser(
168-
(n, c) -> new Builder(n, c.indexVersionCreated(), c.getIndexAnalyzers(), SourceFieldMapper.isSynthetic(c.getIndexSettings()))
176+
(n, c) -> new Builder(
177+
n,
178+
c.indexVersionCreated(),
179+
c.getIndexAnalyzers(),
180+
SourceFieldMapper.isSynthetic(c.getIndexSettings()),
181+
c.isWithinMultiField()
182+
)
169183
);
170184

171185
/**
@@ -484,15 +498,17 @@ private void emitAnnotation(int firstSpannedTextPosInc, int annotationPosLen) th
484498
}
485499

486500
public static final class AnnotatedTextFieldType extends TextFieldMapper.TextFieldType {
501+
487502
private AnnotatedTextFieldType(
488503
String name,
489504
boolean store,
490505
TextSearchInfo tsi,
491506
boolean isSyntheticSource,
507+
boolean isWithinMultiField,
492508
KeywordFieldMapper.KeywordFieldType syntheticSourceDelegate,
493509
Map<String, String> meta
494510
) {
495-
super(name, true, store, tsi, isSyntheticSource, syntheticSourceDelegate, meta, false, false);
511+
super(name, true, store, tsi, isSyntheticSource, isWithinMultiField, syntheticSourceDelegate, meta, false, false);
496512
}
497513

498514
public AnnotatedTextFieldType(String name, Map<String, String> meta) {
@@ -505,9 +521,9 @@ public String typeName() {
505521
}
506522
}
507523

524+
private final IndexVersion indexCreatedVersion;
508525
private final FieldType fieldType;
509526
private final Builder builder;
510-
511527
private final NamedAnalyzer indexAnalyzer;
512528

513529
protected AnnotatedTextFieldMapper(
@@ -518,10 +534,18 @@ protected AnnotatedTextFieldMapper(
518534
Builder builder
519535
) {
520536
super(simpleName, mappedFieldType, builderParams);
537+
521538
assert fieldType.tokenized();
539+
522540
this.fieldType = freezeAndDeduplicateFieldType(fieldType);
523541
this.builder = builder;
524542
this.indexAnalyzer = wrapAnalyzer(builder.analyzers.getIndexAnalyzer());
543+
this.indexCreatedVersion = builder.indexCreatedVersion();
544+
}
545+
546+
@Override
547+
public AnnotatedTextFieldType fieldType() {
548+
return (AnnotatedTextFieldType) super.fieldType();
525549
}
526550

527551
@Override
@@ -544,6 +568,26 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio
544568
context.addToFieldNames(fieldType().name());
545569
}
546570
}
571+
572+
// if synthetic source needs to be supported, yet the field isn't stored, then we need to rely on something else
573+
if (needsToSupportSyntheticSource() && fieldType.stored() == false) {
574+
// if we can rely on the synthetic source delegate for synthetic source, then return
575+
if (fieldType().canUseSyntheticSourceDelegateForSyntheticSource(value)) {
576+
return;
577+
}
578+
579+
// otherwise, we need to store a copy of this value so that synthetic source can load it
580+
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
581+
context.doc().add(new StoredField(fieldName, value));
582+
}
583+
}
584+
585+
private boolean needsToSupportSyntheticSource() {
586+
if (TextFieldMapper.multiFieldsNotStoredByDefaultIndexVersionCheck(indexCreatedVersion)) {
587+
// if we're within a multi field, then supporting synthetic source isn't necessary as that's the responsibility of the parent
588+
return fieldType().isSyntheticSourceEnabled() && fieldType().isWithinMultiField() == false;
589+
}
590+
return fieldType().isSyntheticSourceEnabled();
547591
}
548592

549593
@Override
@@ -553,8 +597,13 @@ protected String contentType() {
553597

554598
@Override
555599
public FieldMapper.Builder getMergeBuilder() {
556-
return new Builder(leafName(), builder.indexCreatedVersion, builder.analyzers.indexAnalyzers, builder.isSyntheticSourceEnabled)
557-
.init(this);
600+
return new Builder(
601+
leafName(),
602+
builder.indexCreatedVersion(),
603+
builder.analyzers.indexAnalyzers,
604+
fieldType().isSyntheticSourceEnabled(),
605+
fieldType().isWithinMultiField()
606+
).init(this);
558607
}
559608

560609
@Override
@@ -568,11 +617,32 @@ protected void write(XContentBuilder b, Object value) throws IOException {
568617
});
569618
}
570619

620+
return new SyntheticSourceSupport.Native(() -> syntheticFieldLoader(fullPath(), leafName()));
621+
}
622+
623+
private SourceLoader.SyntheticFieldLoader syntheticFieldLoader(String fullFieldName, String leafFieldName) {
624+
// since we don't know whether the delegate field loader can be used for synthetic source until parsing, we need to check both this
625+
// field and the delegate
626+
627+
// first field loader - to check whether the field's value was stored under this match_only_text field
628+
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
629+
final var thisFieldLayer = new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) {
630+
@Override
631+
protected void writeValue(Object value, XContentBuilder b) throws IOException {
632+
b.value(value.toString());
633+
}
634+
};
635+
636+
final CompositeSyntheticFieldLoader fieldLoader = new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, thisFieldLayer);
637+
638+
// second loader - to check whether the field's value was stored by a keyword delegate field
571639
var kwd = TextFieldMapper.SyntheticSourceHelper.getKeywordFieldMapperForSyntheticSource(this);
572640
if (kwd != null) {
573-
return new SyntheticSourceSupport.Native(() -> kwd.syntheticFieldLoader(fullPath(), leafName()));
641+
// merge the two field loaders into one
642+
var kwdFieldLoader = kwd.syntheticFieldLoader(fullPath(), leafName());
643+
return fieldLoader.mergedWith(kwdFieldLoader);
574644
}
575645

576-
return super.syntheticSourceSupport();
646+
return fieldLoader;
577647
}
578648
}

0 commit comments

Comments
 (0)