diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/Arg.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/Arg.java new file mode 100644 index 0000000000000..249a5dccdabf0 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/Arg.java @@ -0,0 +1,122 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext; + +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.DataInput; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Base64; +import java.util.List; + +/** + * Describes the type and location of an argument in the template. A list of argument infos is encoded and stored in a doc value + * column; this is used to re-combine the template and argument columns. Documents with identical templates share the same + * list of argument infos, and since indices are sorted by template_id, this doc value column compresses very well. 
+ */ +public class Arg { + + private static final String SPACE = " "; + private static final Base64.Decoder DECODER = Base64.getUrlDecoder(); + private static final Base64.Encoder ENCODER = Base64.getUrlEncoder().withoutPadding(); + private static int VINT_MAX_BYTES = 5; + + public enum Type { + GENERIC(0); + + private final int code; + private static final Type[] lookup = new Type[values().length]; + static { + for (var type : values()) { + lookup[type.code] = type; + } + } + + Type(int code) { + this.code = code; + } + + public int toCode() { + return code; + } + + public static Type fromCode(int code) { + return lookup[code]; + } + } + + record Info(Type type, int offsetInTemplate) { + public Info { + assert offsetInTemplate >= 0; + } + + void writeTo(ByteArrayDataOutput out, int previousOffset) throws IOException { + out.writeVInt(type.toCode()); + int diff = offsetInTemplate - previousOffset; + out.writeVInt(diff); + } + + static Info readFrom(DataInput in, int previousOffset) throws IOException { + var type = Type.fromCode(in.readVInt()); + int diffFromPrevious = in.readVInt(); + int offsetInfoTemplate = previousOffset + diffFromPrevious; + return new Info(type, offsetInfoTemplate); + } + } + + static boolean isArg(String text) { + for (int i = 0; i < text.length(); i++) { + if (Character.isDigit(text.charAt(i))) { + return true; + } + } + return false; + } + + static String encodeInfo(List arguments) throws IOException { + int maxSize = VINT_MAX_BYTES + arguments.size() * (VINT_MAX_BYTES + VINT_MAX_BYTES); + byte[] buffer = new byte[maxSize]; + var dataInput = new ByteArrayDataOutput(buffer); + dataInput.writeVInt(arguments.size()); + int previousOffset = 0; + for (var arg : arguments) { + arg.writeTo(dataInput, previousOffset); + previousOffset = arg.offsetInTemplate; + } + + int size = dataInput.getPosition(); + byte[] data = Arrays.copyOfRange(buffer, 0, size); + return ENCODER.encodeToString(data); + } + + static List decodeInfo(String encoded) throws 
IOException { + byte[] encodedBytes = DECODER.decode(encoded); + var input = new ByteArrayDataInput(encodedBytes); + + int numArgs = input.readVInt(); + int previousOffset = 0; + List arguments = new ArrayList<>(numArgs); + for (int i = 0; i < numArgs; i++) { + var argInfo = Info.readFrom(input, previousOffset); + arguments.add(argInfo); + previousOffset = argInfo.offsetInTemplate; + } + return arguments; + } + + static String encodeRemainingArgs(PatternedTextValueProcessor.Parts parts) { + return String.join(SPACE, parts.args()); + } + + static String[] decodeRemainingArgs(String mergedArgs) { + return mergedArgs.split(SPACE); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextDocValues.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextDocValues.java index b7dfdc95683e5..c19a2e9aa9b31 100644 --- a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextDocValues.java +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextDocValues.java @@ -14,35 +14,42 @@ import org.apache.lucene.util.BytesRef; import java.io.IOException; +import java.util.List; public class PatternedTextDocValues extends BinaryDocValues { private final SortedSetDocValues templateDocValues; private final SortedSetDocValues argsDocValues; + private final SortedSetDocValues argsInfoDocValues; - PatternedTextDocValues(SortedSetDocValues templateDocValues, SortedSetDocValues argsDocValues) { + PatternedTextDocValues(SortedSetDocValues templateDocValues, SortedSetDocValues argsDocValues, SortedSetDocValues argsInfoDocValues) { this.templateDocValues = templateDocValues; this.argsDocValues = argsDocValues; + this.argsInfoDocValues = argsInfoDocValues; } - static PatternedTextDocValues from(LeafReader leafReader, String templateFieldName, String argsFieldName) throws IOException { + static PatternedTextDocValues 
from(LeafReader leafReader, String templateFieldName, String argsFieldName, String argsInfoFieldName) + throws IOException { SortedSetDocValues templateDocValues = DocValues.getSortedSet(leafReader, templateFieldName); if (templateDocValues.getValueCount() == 0) { return null; } SortedSetDocValues argsDocValues = DocValues.getSortedSet(leafReader, argsFieldName); - return new PatternedTextDocValues(templateDocValues, argsDocValues); + SortedSetDocValues argsInfoDocValues = DocValues.getSortedSet(leafReader, argsInfoFieldName); + return new PatternedTextDocValues(templateDocValues, argsDocValues, argsInfoDocValues); } private String getNextStringValue() throws IOException { assert templateDocValues.docValueCount() == 1; String template = templateDocValues.lookupOrd(templateDocValues.nextOrd()).utf8ToString(); - int argsCount = PatternedTextValueProcessor.countArgs(template); - if (argsCount > 0) { + List argsInfo = Arg.decodeInfo(argsInfoDocValues.lookupOrd(argsInfoDocValues.nextOrd()).utf8ToString()); + + if (argsInfo.isEmpty() == false) { assert argsDocValues.docValueCount() == 1; + assert argsInfoDocValues.docValueCount() == 1; var mergedArgs = argsDocValues.lookupOrd(argsDocValues.nextOrd()); - var args = PatternedTextValueProcessor.decodeRemainingArgs(mergedArgs.utf8ToString()); - return PatternedTextValueProcessor.merge(new PatternedTextValueProcessor.Parts(template, args)); + var args = Arg.decodeRemainingArgs(mergedArgs.utf8ToString()); + return PatternedTextValueProcessor.merge(template, args, argsInfo); } else { return template; } @@ -56,6 +63,7 @@ public BytesRef binaryValue() throws IOException { @Override public boolean advanceExact(int i) throws IOException { argsDocValues.advanceExact(i); + argsInfoDocValues.advanceExact(i); // If template has a value, then message has a value. 
We don't have to check args here, since there may not be args for the doc return templateDocValues.advanceExact(i); } @@ -69,7 +77,9 @@ public int docID() { public int nextDoc() throws IOException { int templateNext = templateDocValues.nextDoc(); var argsAdvance = argsDocValues.advance(templateNext); + var argsInfoAdvance = argsInfoDocValues.advance(templateNext); assert argsAdvance >= templateNext; + assert argsInfoAdvance == templateNext; return templateNext; } @@ -77,12 +87,14 @@ public int nextDoc() throws IOException { public int advance(int i) throws IOException { int templateAdvance = templateDocValues.advance(i); var argsAdvance = argsDocValues.advance(templateAdvance); + var argsInfoAdvance = argsInfoDocValues.advance(templateAdvance); assert argsAdvance >= templateAdvance; + assert argsInfoAdvance == templateAdvance; return templateAdvance; } @Override public long cost() { - return templateDocValues.cost() + argsDocValues.cost(); + return templateDocValues.cost() + argsDocValues.cost() + argsInfoDocValues.cost(); } } diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldMapper.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldMapper.java index 3e0bc05430835..0b82d5299ef39 100644 --- a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldMapper.java +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldMapper.java @@ -184,9 +184,13 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio // Add template_id doc_values context.doc().add(templateIdMapper.buildKeywordField(new BytesRef(parts.templateId()))); + // Add args Info + String argsInfoEncoded = Arg.encodeInfo(parts.argsInfo()); + context.doc().add(new SortedSetDocValuesField(fieldType().argsInfoFieldName(), new BytesRef(argsInfoEncoded))); + // Add args doc_values if 
(parts.args().isEmpty() == false) { - String remainingArgs = PatternedTextValueProcessor.encodeRemainingArgs(parts); + String remainingArgs = Arg.encodeRemainingArgs(parts); context.doc().add(new SortedSetDocValuesField(fieldType().argsFieldName(), new BytesRef(remainingArgs))); } } @@ -207,7 +211,12 @@ protected SyntheticSourceSupport syntheticSourceSupport() { () -> new CompositeSyntheticFieldLoader( leafName(), fullPath(), - new PatternedTextSyntheticFieldLoaderLayer(fieldType().name(), fieldType().templateFieldName(), fieldType().argsFieldName()) + new PatternedTextSyntheticFieldLoaderLayer( + fieldType().name(), + fieldType().templateFieldName(), + fieldType().argsFieldName(), + fieldType().argsInfoFieldName() + ) ) ); } diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldType.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldType.java index e23e1428fbe24..52c9986f017ff 100644 --- a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldType.java +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldType.java @@ -57,6 +57,7 @@ public class PatternedTextFieldType extends StringFieldType { private static final String TEMPLATE_SUFFIX = ".template"; private static final String TEMPLATE_ID_SUFFIX = ".template_id"; private static final String ARGS_SUFFIX = ".args"; + private static final String ARGS_INFO_SUFFIX = ".args_info"; public static final String CONTENT_TYPE = "patterned_text"; @@ -272,4 +273,8 @@ String argsFieldName() { return name() + ARGS_SUFFIX; } + String argsInfoFieldName() { + return name() + ARGS_INFO_SUFFIX; + } + } diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextIndexFieldData.java 
b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextIndexFieldData.java index 8e532a9dd5a3a..11a91736e348b 100644 --- a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextIndexFieldData.java +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextIndexFieldData.java @@ -75,7 +75,8 @@ public LeafFieldData loadDirect(LeafReaderContext context) throws IOException { PatternedTextDocValues docValues = PatternedTextDocValues.from( leafReader, fieldType.templateFieldName(), - fieldType.argsFieldName() + fieldType.argsFieldName(), + fieldType.argsInfoFieldName() ); return new LeafFieldData() { diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextSyntheticFieldLoaderLayer.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextSyntheticFieldLoaderLayer.java index f05fa31671cda..59c05fbc03efb 100644 --- a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextSyntheticFieldLoaderLayer.java +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextSyntheticFieldLoaderLayer.java @@ -19,12 +19,14 @@ class PatternedTextSyntheticFieldLoaderLayer implements CompositeSyntheticFieldL private final String name; private final String templateFieldName; private final String argsFieldName; + private final String argsInfoFieldName; private PatternedTextSyntheticFieldLoader loader; - PatternedTextSyntheticFieldLoaderLayer(String name, String templateFieldName, String argsFieldName) { + PatternedTextSyntheticFieldLoaderLayer(String name, String templateFieldName, String argsFieldName, String argsInfoFieldName) { this.name = name; this.templateFieldName = templateFieldName; this.argsFieldName = argsFieldName; + this.argsInfoFieldName = argsInfoFieldName; } @Override @@ -34,7 +36,7 @@ public 
long valueCount() { @Override public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException { - var docValues = PatternedTextDocValues.from(leafReader, templateFieldName, argsFieldName); + var docValues = PatternedTextDocValues.from(leafReader, templateFieldName, argsFieldName, argsInfoFieldName); if (docValues == null) { return null; } diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextValueProcessor.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextValueProcessor.java index 70fdb97cfd6c3..00b8aaf232c6f 100644 --- a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextValueProcessor.java +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextValueProcessor.java @@ -11,22 +11,28 @@ import org.elasticsearch.common.hash.MurmurHash3; import org.elasticsearch.common.util.ByteUtils; +import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; public class PatternedTextValueProcessor { - private static final String TEXT_ARG_PLACEHOLDER = "%W"; private static final String DELIMITER = "[\\s\\[\\]]"; - private static final String SPACE = " "; - record Parts(String template, String templateId, List args) { - Parts(String template, List args) { - this(template, PatternedTextValueProcessor.templateId(template), args); + public record Parts(String template, String templateId, List args, List argsInfo) { + Parts(String template, List args, List argsInfo) { + this(template, PatternedTextValueProcessor.templateId(template), args, argsInfo); } } + public static int originalSize(String template, String[] args) { + int size = template.length(); + for (var arg : args) { + size += arg.length(); + } + return size; + } + static String templateId(String template) { byte[] bytes = 
template.getBytes(StandardCharsets.UTF_8); MurmurHash3.Hash128 hash = new MurmurHash3.Hash128(); @@ -36,88 +42,59 @@ static String templateId(String template) { return Strings.BASE_64_NO_PADDING_URL_ENCODER.encodeToString(hashBytes); } - static Parts split(String text) { - StringBuilder template = new StringBuilder(); + static Parts split(String text) throws IOException { + StringBuilder template = new StringBuilder(text.length()); List args = new ArrayList<>(); + List argsInfo = new ArrayList<>(); String[] tokens = text.split(DELIMITER); int textIndex = 0; for (String token : tokens) { if (token.isEmpty()) { + // add the previous delimiter if (textIndex < text.length() - 1) { template.append(text.charAt(textIndex++)); } - continue; - } - if (isArg(token)) { - args.add(token); - template.append(TEXT_ARG_PLACEHOLDER); } else { - template.append(token); - } - textIndex += token.length(); - if (textIndex < text.length()) { - template.append(text.charAt(textIndex++)); + if (Arg.isArg(token)) { + args.add(token); + argsInfo.add(new Arg.Info(Arg.Type.GENERIC, template.length())); + } else { + template.append(token); + } + textIndex += token.length(); + if (textIndex < text.length()) { + template.append(text.charAt(textIndex++)); + } } } while (textIndex < text.length()) { template.append(text.charAt(textIndex++)); } - return new Parts(template.toString(), args); + return new Parts(template.toString(), args, argsInfo); } - private static boolean isArg(String text) { - for (int i = 0; i < text.length(); i++) { - if (Character.isDigit(text.charAt(i))) { - return true; - } - } - return false; + // For testing + public static String merge(Parts parts) { + return merge(parts.template, parts.args.toArray(String[]::new), parts.argsInfo); } - static String merge(Parts parts) { - StringBuilder builder = new StringBuilder(); - String[] templateParts = parts.template.split(DELIMITER); - int i = 0; - int templateIndex = 0; - for (String part : templateParts) { - if 
(part.equals(TEXT_ARG_PLACEHOLDER)) { - builder.append(parts.args.get(i++)); - templateIndex += TEXT_ARG_PLACEHOLDER.length(); - } else if (part.isEmpty() == false) { - builder.append(part); - templateIndex += part.length(); - } - if (templateIndex < parts.template.length()) { - builder.append(parts.template.charAt(templateIndex++)); - } - } - assert i == parts.args.size() : "expected " + i + " but got " + parts.args.size(); - assert builder.toString().contains(TEXT_ARG_PLACEHOLDER) == false : builder.toString(); - while (templateIndex < parts.template.length()) { - builder.append(parts.template.charAt(templateIndex++)); - } - return builder.toString(); - } + static String merge(String template, String[] args, List argsInfo) { + StringBuilder builder = new StringBuilder(originalSize(template, args)); + int numArgs = args.length; - static String encodeRemainingArgs(Parts parts) { - return String.join(SPACE, parts.args); - } + int nextToWrite = 0; + for (int i = 0; i < numArgs; i++) { + String arg = args[i]; + var argInfo = argsInfo.get(i); - static List decodeRemainingArgs(String mergedArgs) { - return Arrays.asList(mergedArgs.split(SPACE)); - } + builder.append(template, nextToWrite, argInfo.offsetInTemplate()); + builder.append(arg); + nextToWrite = argInfo.offsetInTemplate(); + } - static int countArgs(String template) { - int count = 0; - for (int i = 0; i < template.length() - 1; i++) { - if (template.charAt(i) == '%') { - char next = template.charAt(i + 1); - if (next == 'W') { - count++; - i++; - } - } + if (nextToWrite < template.length()) { + builder.append(template, nextToWrite, template.length()); } - return count; + return builder.toString(); } } diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/ArgTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/ArgTests.java new file mode 100644 index 0000000000000..9e97513d9bc33 --- /dev/null +++ 
b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/ArgTests.java @@ -0,0 +1,49 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext; + +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class ArgTests extends ESTestCase { + + public void testInfoRoundTrip() throws IOException { + byte[] buf = new byte[15]; + var output = new ByteArrayDataOutput(buf); + int previousOffset = randomIntBetween(0, 100); + Arg.Info info = randomInfo(); + info.writeTo(output, previousOffset); + + var input = new ByteArrayDataInput(buf); + Arg.Info actual = Arg.Info.readFrom(input, previousOffset); + assertEquals(info, actual); + } + + public void testInfoListRoundTrip() throws IOException { + int numArgs = randomIntBetween(0, 100); + var infoList = new ArrayList(); + for (int i = 0; i < numArgs; i++) { + infoList.add(randomInfo()); + } + + String encoded = Arg.encodeInfo(infoList); + List actual = Arg.decodeInfo(encoded); + + assertEquals(infoList.size(), actual.size()); + assertArrayEquals(infoList.toArray(), actual.toArray()); + } + + Arg.Info randomInfo() { + return new Arg.Info(randomFrom(Arg.Type.values()), randomIntBetween(0, 10_000)); + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternTextDocValuesTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternTextDocValuesTests.java index 85eeac12abfb6..20940e9d7b4e2 100644 --- 
a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternTextDocValuesTests.java +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternTextDocValuesTests.java @@ -12,6 +12,7 @@ import org.elasticsearch.test.ESTestCase; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Objects; @@ -21,22 +22,25 @@ public class PatternTextDocValuesTests extends ESTestCase { - private static PatternedTextDocValues makeDocValueSparseArgs() { - var template = new SimpleSortedSetDocValues("%W dog", "cat", "%W mouse %W", "hat %W"); + private static PatternedTextDocValues makeDocValueSparseArgs() throws IOException { + var template = new SimpleSortedSetDocValues(removePlaceholders("% dog", "cat", "% mouse %", "hat %")); var args = new SimpleSortedSetDocValues("1", null, "2 3", "4"); - return new PatternedTextDocValues(template, args); + var info = new SimpleSortedSetDocValues(info(0), info(), info(0, 7), info(4)); + return new PatternedTextDocValues(template, args, info); } - private static PatternedTextDocValues makeDocValuesDenseArgs() { - var template = new SimpleSortedSetDocValues("%W moose", "%W goose %W", "%W mouse %W", "%W house"); + private static PatternedTextDocValues makeDocValuesDenseArgs() throws IOException { + var template = new SimpleSortedSetDocValues(removePlaceholders("% moose", "% goose %", "% mouse %", "% house")); var args = new SimpleSortedSetDocValues("1", "4 5", "2 3", "7"); - return new PatternedTextDocValues(template, args); + var info = new SimpleSortedSetDocValues(info(0), info(0, 7), info(0, 7), info(0)); + return new PatternedTextDocValues(template, args, info); } - private static PatternedTextDocValues makeDocValueMissingValues() { - var template = new SimpleSortedSetDocValues("%W cheddar", "cat", null, "%W cheese"); + private static PatternedTextDocValues makeDocValueMissingValues() throws IOException { + var 
template = new SimpleSortedSetDocValues(removePlaceholders("% cheddar", "cat", null, "% cheese")); var args = new SimpleSortedSetDocValues("1", null, null, "4"); - return new PatternedTextDocValues(template, args); + var info = new SimpleSortedSetDocValues(info(0), info(), info(), info(0)); + return new PatternedTextDocValues(template, args, info); } public void testNextDoc() throws IOException { @@ -171,4 +175,17 @@ public long cost() { return 1; } } + + private static String info(int... offsets) throws IOException { + List argsInfo = new ArrayList<>(); + for (var offset : offsets) { + argsInfo.add(new Arg.Info(Arg.Type.GENERIC, offset)); + } + return Arg.encodeInfo(argsInfo); + } + + // Placeholders are only included here to help in testing + private static String[] removePlaceholders(String... values) { + return Arrays.stream(values).map(s -> s == null ? null : s.replace("%", "")).toList().toArray(String[]::new); + } } diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldMapperTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldMapperTests.java index f61b8f7f8078a..8d4088cd77df0 100644 --- a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldMapperTests.java +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldMapperTests.java @@ -151,7 +151,8 @@ public void testDefaults() throws IOException { { List fields = doc.rootDoc().getFields("field.template_id"); assertEquals(1, fields.size()); - assertEquals("D3OycqSEnDM", fields.get(0).binaryValue().utf8ToString()); + // Template is an empty string, so the templateId hash has value AAAAAAAAAAA + assertEquals("AAAAAAAAAAA", fields.get(0).binaryValue().utf8ToString()); IndexableFieldType fieldType = fields.get(0).fieldType(); assertThat(fieldType.omitNorms(), equalTo(true)); assertFalse(fieldType.tokenized()); 
diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextValueProcessorTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextValueProcessorTests.java index fe496fdaeb558..dc674c4aa8853 100644 --- a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextValueProcessorTests.java +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextValueProcessorTests.java @@ -10,13 +10,17 @@ import org.elasticsearch.test.ESTestCase; import org.hamcrest.Matchers; +import java.io.IOException; +import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; +import static org.hamcrest.Matchers.equalTo; + public class PatternedTextValueProcessorTests extends ESTestCase { - public void testEmpty() { + public void testEmpty() throws IOException { String text = ""; PatternedTextValueProcessor.Parts parts = PatternedTextValueProcessor.split(text); assertEquals(text, parts.template()); @@ -24,7 +28,7 @@ public void testEmpty() { assertEquals(text, PatternedTextValueProcessor.merge(parts)); } - public void testWhitespace() { + public void testWhitespace() throws IOException { String text = " "; PatternedTextValueProcessor.Parts parts = PatternedTextValueProcessor.split(text); assertEquals(text, parts.template()); @@ -32,43 +36,48 @@ public void testWhitespace() { assertEquals(text, PatternedTextValueProcessor.merge(parts)); } - public void testWithoutTimestamp() { + public void testWithoutTimestamp() throws IOException { String text = " some text with arg1 and 2arg2 and 333 "; PatternedTextValueProcessor.Parts parts = PatternedTextValueProcessor.split(text); - assertEquals(" some text with %W and %W and %W ", parts.template()); + assertEquals(" some text with and and ", parts.template()); assertThat(parts.args(), Matchers.contains("arg1", "2arg2", "333")); + 
assertThat(parts.argsInfo(), equalTo(info(16, 21, 26))); assertEquals(text, PatternedTextValueProcessor.merge(parts)); } - public void testWithTimestamp() { + public void testWithTimestamp() throws IOException { String text = " 2021-04-13T13:51:38.000Z some text with arg1 and arg2 and arg3"; PatternedTextValueProcessor.Parts parts = PatternedTextValueProcessor.split(text); - assertEquals(" %W some text with %W and %W and %W", parts.template()); + assertEquals(" some text with and and ", parts.template()); assertThat(parts.args(), Matchers.contains("2021-04-13T13:51:38.000Z", "arg1", "arg2", "arg3")); + assertThat(parts.argsInfo(), equalTo(info(1, 17, 22, 27))); assertEquals(text, PatternedTextValueProcessor.merge(parts)); } - public void testWithDateSpaceTime() { + public void testWithDateSpaceTime() throws IOException { String text = " 2021-04-13 13:51:38 some text with arg1 and arg2 and arg3"; PatternedTextValueProcessor.Parts parts = PatternedTextValueProcessor.split(text); - assertEquals(" %W %W some text with %W and %W and %W", parts.template()); + assertEquals(" some text with and and ", parts.template()); + assertThat(parts.argsInfo(), equalTo(info(1, 2, 18, 23, 28))); assertThat(parts.args(), Matchers.contains("2021-04-13", "13:51:38", "arg1", "arg2", "arg3")); assertEquals(text, PatternedTextValueProcessor.merge(parts)); } - public void testMalformedDate() { + public void testMalformedDate() throws IOException { String text = "2020/09/06 10:11:38 Using namespace: kubernetes-dashboard' | HTTP status: 400, message: [1:395]"; PatternedTextValueProcessor.Parts parts = PatternedTextValueProcessor.split(text); - assertEquals("%W %W Using namespace: kubernetes-dashboard' | HTTP status: %W message: [%W]", parts.template()); + assertEquals(" Using namespace: kubernetes-dashboard' | HTTP status: message: []", parts.template()); + assertThat(parts.argsInfo(), equalTo(info(0, 1, 56, 67))); assertThat(parts.args(), Matchers.contains("2020/09/06", "10:11:38", "400,", 
"1:395")); assertEquals(text, PatternedTextValueProcessor.merge(parts)); } - public void testUUID() { + public void testUUID() throws IOException { String text = "[2020-08-18T00:58:56.751+00:00][15][2354][action_controller][INFO]: [18be2355-6306-4a00-9db9-f0696aa1a225] " + "some text with arg1 and arg2"; PatternedTextValueProcessor.Parts parts = PatternedTextValueProcessor.split(text); - assertEquals("[%W][%W][%W][action_controller][INFO]: [%W] some text with %W and %W", parts.template()); + assertEquals("[][][][action_controller][INFO]: [] some text with and ", parts.template()); + assertThat(parts.argsInfo(), equalTo(info(1, 3, 5, 34, 51, 56))); assertThat( parts.args(), Matchers.contains("2020-08-18T00:58:56.751+00:00", "15", "2354", "18be2355-6306-4a00-9db9-f0696aa1a225", "arg1", "arg2") @@ -76,18 +85,20 @@ public void testUUID() { assertEquals(text, PatternedTextValueProcessor.merge(parts)); } - public void testIP() { + public void testIP() throws IOException { String text = "[2020-08-18T00:58:56.751+00:00][15][2354][action_controller][INFO]: from 94.168.152.150 and arg1"; PatternedTextValueProcessor.Parts parts = PatternedTextValueProcessor.split(text); - assertEquals("[%W][%W][%W][action_controller][INFO]: from %W and %W", parts.template()); + assertEquals("[][][][action_controller][INFO]: from and ", parts.template()); + assertThat(parts.argsInfo(), equalTo(info(1, 3, 5, 38, 43))); assertThat(parts.args(), Matchers.contains("2020-08-18T00:58:56.751+00:00", "15", "2354", "94.168.152.150", "arg1")); assertEquals(text, PatternedTextValueProcessor.merge(parts)); } - public void testSecondDate() { + public void testSecondDate() throws IOException { String text = "[2020-08-18T00:58:56.751+00:00][15][2354][action_controller][INFO]: at 2020-08-18 00:58:56 +0000 and arg1"; PatternedTextValueProcessor.Parts parts = PatternedTextValueProcessor.split(text); - assertEquals("[%W][%W][%W][action_controller][INFO]: at %W %W %W and %W", parts.template()); + 
assertEquals("[][][][action_controller][INFO]: at and ", parts.template()); + assertThat(parts.argsInfo(), equalTo(info(1, 3, 5, 36, 37, 38, 43))); assertThat( parts.args(), Matchers.contains("2020-08-18T00:58:56.751+00:00", "15", "2354", "2020-08-18", "00:58:56", "+0000", "arg1") @@ -95,27 +106,28 @@ public void testSecondDate() { assertEquals(text, PatternedTextValueProcessor.merge(parts)); } - public void testWithTimestamp1() { + public void testWithTimestampStartBrackets() throws IOException { String text = "[2020-08-18T00:58:56] Found 123 errors for service [cheddar1]"; PatternedTextValueProcessor.Parts parts = PatternedTextValueProcessor.split(text); - assertEquals("[%W] Found %W errors for service [%W]", parts.template()); + assertEquals("[] Found errors for service []", parts.template()); + assertThat(parts.argsInfo(), equalTo(info(1, 9, 30))); assertThat(parts.args(), Matchers.contains("2020-08-18T00:58:56", "123", "cheddar1")); assertEquals(text, PatternedTextValueProcessor.merge(parts)); } - public void testTemplateIdIsExpectedShape() { + public void testTemplateIdIsExpectedShape() throws IOException { String text = "[2020-08-18T00:58:56] Found 123 errors for service [cheddar1]"; PatternedTextValueProcessor.Parts parts = PatternedTextValueProcessor.split(text); - assertEquals("vSr1YMYPups", parts.templateId()); + assertEquals("1l_PtCLQ5xY", parts.templateId()); } - public void testTemplateIdHasVeryFewCollisions() { + public void testTemplateIdHasVeryFewCollisions() throws IOException { Set templates = new HashSet<>(); Set ids = new HashSet<>(); for (int i = 0; i < 1000; i++) { var template = randomTemplate(); - var parts = new PatternedTextValueProcessor.Parts(template, List.of()); + var parts = new PatternedTextValueProcessor.Parts(template, List.of(), List.of()); templates.add(template); ids.add(parts.templateId()); } @@ -127,18 +139,18 @@ private static String randomTemplate() { StringBuilder sb = new StringBuilder(); int numTokens = 
randomIntBetween(1, 20); for (int i = 0; i < numTokens; i++) { - var token = randomBoolean() ? randomAlphaOfLength(between(1, 10)) : randomPlaceholder(); + var token = randomBoolean() ? randomAlphaOfLength(between(1, 10)) : ""; sb.append(token); sb.append(randomDelimiter()); } return sb.toString(); } - private static String randomPlaceholder() { - return randomFrom(List.of("%W", "%D", "%I", "%U", "%T")); - } - private static String randomDelimiter() { return randomFrom(List.of(" ", "\n", "\t", "[", "]")); } + + private static List info(int... offsets) throws IOException { + return Arrays.stream(offsets).mapToObj(o -> new Arg.Info(Arg.Type.GENERIC, o)).toList(); + } } diff --git a/x-pack/plugin/logsdb/src/yamlRestTest/resources/rest-api-spec/test/patternedtext/30_template_id.yml b/x-pack/plugin/logsdb/src/yamlRestTest/resources/rest-api-spec/test/patternedtext/30_template_id.yml index 4d63c8727b10e..16e911a6896d3 100644 --- a/x-pack/plugin/logsdb/src/yamlRestTest/resources/rest-api-spec/test/patternedtext/30_template_id.yml +++ b/x-pack/plugin/logsdb/src/yamlRestTest/resources/rest-api-spec/test/patternedtext/30_template_id.yml @@ -81,10 +81,10 @@ Get template_id field: values: ["2"] - match: { hits.total.value: 1 } - - match: { hits.hits.0.fields: {"foo.template_id": ["iJEgF75EQNk"]} } + - match: { hits.hits.0.fields: {"foo.template_id": ["mOVsnxlxdac"]} } --- -Sort by template_id index config setting: +Sort by template_id descending from index config setting: - do: search: index: test @@ -93,19 +93,19 @@ Sort by template_id index config setting: query: { match_all: {} } - match: { hits.total.value: 5 } - - match: { hits.hits.0.fields: {"foo.template_id": ["vSr1YMYPups"]} } - - match: { hits.hits.0._id: "3" } - - match: { hits.hits.1.fields: {"foo.template_id": ["k-2qtjujOCw"]} } - - match: { hits.hits.1._id: "4" } - - match: { hits.hits.2.fields: {"foo.template_id": ["iJEgF75EQNk"]} } - - match: { hits.hits.2._id: "2" } - - match: { hits.hits.3.fields: 
{"foo.template_id": ["iJEgF75EQNk"]} } - - match: { hits.hits.3._id: "5" } + - match: { hits.hits.0.fields: {"foo.template_id": ["mOVsnxlxdac"]} } + - match: { hits.hits.0._id: "2" } + - match: { hits.hits.1.fields: {"foo.template_id": ["mOVsnxlxdac"]} } + - match: { hits.hits.1._id: "5" } + - match: { hits.hits.2.fields: {"foo.template_id": ["k-2qtjujOCw"]} } + - match: { hits.hits.2._id: "4" } + - match: { hits.hits.3.fields: {"foo.template_id": ["1l_PtCLQ5xY"]} } + - match: { hits.hits.3._id: "3" } - match: { hits.hits.4.fields: null } - match: { hits.hits.4._id: "1" } --- -Sort by template_id: +Sort by template_id ascending from request: - do: search: @@ -116,14 +116,14 @@ Sort by template_id: query: { match_all: {} } - match: { hits.total.value: 5 } - - match: { hits.hits.0.fields: {"foo.template_id": ["iJEgF75EQNk"]} } - - match: { hits.hits.0._id: "2" } - - match: { hits.hits.1.fields: {"foo.template_id": ["iJEgF75EQNk"]} } - - match: { hits.hits.1._id: "5" } - - match: { hits.hits.2.fields: {"foo.template_id": ["k-2qtjujOCw"]} } - - match: { hits.hits.2._id: "4" } - - match: { hits.hits.3.fields: {"foo.template_id": ["vSr1YMYPups"]} } - - match: { hits.hits.3._id: "3" } + - match: { hits.hits.0.fields: {"foo.template_id": ["1l_PtCLQ5xY"]} } + - match: { hits.hits.0._id: "3" } + - match: { hits.hits.1.fields: {"foo.template_id": ["k-2qtjujOCw"]} } + - match: { hits.hits.1._id: "4" } + - match: { hits.hits.2.fields: {"foo.template_id": ["mOVsnxlxdac"]} } + - match: { hits.hits.2._id: "2" } + - match: { hits.hits.3.fields: {"foo.template_id": ["mOVsnxlxdac"]} } + - match: { hits.hits.3._id: "5" } - match: { hits.hits.4.fields: null } - match: { hits.hits.4._id: "1" } @@ -150,7 +150,7 @@ Match query: body: query: match: - foo.template_id: "iJEgF75EQNk" + foo.template_id: "mOVsnxlxdac" - match: { hits.total.value: 2 } - match: { hits.hits.0._score: 1.0 } @@ -165,11 +165,11 @@ Range query: query: range: foo.template_id: - gt: "j" - # one doc has a null value for 
template_id and two start with i < j + lt: "l" + # one doc has a null value for template_id and two start with m > l - match: { hits.total.value: 2 } - - match: { hits.hits.0._id: "3" } - - match: { hits.hits.1._id: "4" } + - match: { hits.hits.0._id: "4" } + - match: { hits.hits.1._id: "3" } --- Term aggregation: @@ -186,11 +186,11 @@ Term aggregation: - match: { hits.total.value: 5 } - length: { aggregations.template_id_agg.buckets: 3 } - - match: { aggregations.template_id_agg.buckets.0.key: "iJEgF75EQNk" } - - match: { aggregations.template_id_agg.buckets.0.doc_count: 2 } + - match: { aggregations.template_id_agg.buckets.0.key: "1l_PtCLQ5xY" } + - match: { aggregations.template_id_agg.buckets.0.doc_count: 1 } - match: { aggregations.template_id_agg.buckets.1.key: "k-2qtjujOCw" } - match: { aggregations.template_id_agg.buckets.1.doc_count: 1 } - - match: { aggregations.template_id_agg.buckets.2.key: "vSr1YMYPups" } - - match: { aggregations.template_id_agg.buckets.2.doc_count: 1 } + - match: { aggregations.template_id_agg.buckets.2.key: "mOVsnxlxdac" } + - match: { aggregations.template_id_agg.buckets.2.doc_count: 2 }