|
7 | 7 |
|
8 | 8 | package org.elasticsearch.xpack.logsdb.patternedtext; |
9 | 9 |
|
| 10 | +import org.apache.logging.log4j.LogManager; |
| 11 | +import org.apache.logging.log4j.Logger; |
10 | 12 | import org.apache.lucene.analysis.CharArraySet; |
11 | 13 | import org.apache.lucene.document.Field; |
12 | 14 | import org.apache.lucene.document.FieldType; |
13 | 15 | import org.apache.lucene.document.SortedSetDocValuesField; |
14 | 16 | import org.apache.lucene.index.IndexOptions; |
| 17 | +import org.apache.lucene.index.IndexWriter; |
15 | 18 | import org.apache.lucene.util.BytesRef; |
16 | 19 | import org.elasticsearch.analysis.common.PatternAnalyzer; |
17 | 20 | import org.elasticsearch.common.regex.Regex; |
|
33 | 36 |
|
34 | 37 | import java.io.IOException; |
35 | 38 | import java.util.ArrayList; |
| 39 | +import java.util.Arrays; |
36 | 40 | import java.util.Iterator; |
37 | 41 | import java.util.List; |
38 | 42 | import java.util.Map; |
39 | 43 | import java.util.function.Function; |
40 | 44 | import java.util.function.Supplier; |
41 | 45 |
|
| 46 | +import static org.apache.lucene.index.IndexWriter.MAX_TERM_LENGTH; |
| 47 | + |
42 | 48 | /** |
43 | 49 | * A {@link FieldMapper} for full-text log fields that internally splits text into a low cardinality template component |
44 | 50 | * and high cardinality argument component. Separating these pieces allows the template component to be highly compressed. |
45 | 51 | */ |
46 | 52 | public class PatternedTextFieldMapper extends FieldMapper { |
47 | 53 |
|
| 54 | + private static final Logger logger = LogManager.getLogger(PatternedTextFieldMapper.class); |
| 55 | + |
48 | 56 | public static final FeatureFlag PATTERNED_TEXT_MAPPER = new FeatureFlag("patterned_text"); |
49 | 57 | private static final NamedAnalyzer ANALYZER; |
50 | 58 |
|
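As context for the class javadoc above, here is a minimal, self-contained sketch of the kind of template/argument split the mapper relies on. The %A placeholder, the regex, and the class name TemplateSplitSketch are invented for illustration; the real splitting logic lives in PatternedTextValueProcessor and is not shown in this diff.

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical sketch only: splits a log line into a low-cardinality template and
// high-cardinality arguments. The %A placeholder and the regex are invented here;
// they are not the syntax used by PatternedTextValueProcessor.
public class TemplateSplitSketch {
    private static final Pattern ARG = Pattern.compile("\\d[\\w.]*");

    public static void main(String[] args) {
        String value = "user 4921 logged in from 10.0.0.7 in 35ms";
        List<String> extracted = new ArrayList<>();
        StringBuilder template = new StringBuilder();
        Matcher m = ARG.matcher(value);
        int last = 0;
        while (m.find()) {
            template.append(value, last, m.start()).append("%A");
            extracted.add(m.group());
            last = m.end();
        }
        template.append(value.substring(last));
        // template: "user %A logged in from %A in %A"  (few distinct values across many docs)
        // args:     [4921, 10.0.0.7, 35ms]              (unique per document)
        System.out.println(template + " " + extracted);
    }
}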
@@ -220,12 +228,33 @@ protected void parseCreateField(DocumentParserContext context) throws IOException |
220 | 228 |
|
221 | 229 | // Parse template and args |
222 | 230 | PatternedTextValueProcessor.Parts parts = PatternedTextValueProcessor.split(value); |
| 231 | + BytesRef templateBytes = new BytesRef(parts.template()); |
| 232 | + if (templateBytes.length >= IndexWriter.MAX_TERM_LENGTH) { |
| 233 | + // The template is stored below as a SortedSetDocValues term, so it must fit within Lucene's maximum term length. |
| 234 | + logger.error( |
| 235 | + "patterned text template is longer than the maximum allowed term length.\nTemplate={}\nOriginal value={}", |
| 236 | + parts.template(), |
| 237 | + value |
| 238 | + ); |
| 239 | + // Keep the first 30 bytes of the UTF-8 encoded template for the exception message. |
| 240 | + byte[] prefix = new byte[30]; |
| 241 | + System.arraycopy(templateBytes.bytes, templateBytes.offset, prefix, 0, prefix.length); |
| 242 | + String msg = "patterned text template for field \"" |
| 243 | + + fieldType().name() |
| 244 | + + "\" exceeds the maximum term length of " |
| 245 | + + MAX_TERM_LENGTH |
| 246 | + + " bytes once encoded as UTF8, so the document was rejected. " |
| 247 | + + "The prefix of the immense template is: '" |
| 248 | + + Arrays.toString(prefix) |
| 249 | + + "...'"; |
| 250 | + throw new IllegalArgumentException(msg); |
| 251 | + } |
223 | 252 |
|
224 | 253 | // Add index on original value |
225 | 254 | context.doc().add(new Field(fieldType().name(), value, fieldType)); |
226 | 255 |
|
227 | 256 | // Add template doc_values |
228 | | - context.doc().add(new SortedSetDocValuesField(fieldType().templateFieldName(), new BytesRef(parts.template()))); |
| 257 | + context.doc().add(new SortedSetDocValuesField(fieldType().templateFieldName(), templateBytes)); |
229 | 258 |
|
230 | 259 | // Add template_id doc_values |
231 | 260 | context.doc().add(templateIdMapper.buildKeywordField(new BytesRef(parts.templateId()))); |
|
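The new guard measures the template's UTF-8 encoded length via BytesRef rather than String.length(), since IndexWriter.MAX_TERM_LENGTH is a byte limit. A minimal sketch of why that distinction matters (the class name and the repeated character are arbitrary illustration choices):

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.BytesRef;

// Minimal sketch: a string that is well under MAX_TERM_LENGTH in characters can still
// exceed it in bytes once encoded as UTF-8, which is what Lucene actually limits.
public class TemplateLengthCheckSketch {
    public static void main(String[] args) {
        String template = "日".repeat(12_000);       // 12,000 chars, 36,000 UTF-8 bytes
        BytesRef encoded = new BytesRef(template);   // BytesRef(CharSequence) encodes to UTF-8
        System.out.println("chars=" + template.length()
            + " utf8Bytes=" + encoded.length
            + " tooLong=" + (encoded.length >= IndexWriter.MAX_TERM_LENGTH));
    }
}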