Skip to content

Commit 3b25b97

Browse files
Encode args in an info doc value rather than using placeholders (elastic#132782)
The existing method of determining where to insert arguments in the template is by replacing known placeholder values which were inserted in the template string. For example, the message "found 5 errors" would be separated into the argument "5" and the template "found %W errors", where %W is the placeholder. There are a few problems with this method. First, we need special handling if the original message contains a placeholder string. We could handle this with some sort of escape, but this adds complexity and costs time during ingestion. The second issue is that scanning for placeholders within the template string is slow: it is much faster to reconstruct the original message if we already know the location of the arguments in the template string. This PR adds a new doc value column which encodes the location of all arguments in the template. For each argument, it stores the offset in the template string and the type of the argument. There is currently only one argument type, GENERIC. These values are serialized and stored as a base64 string in SortedSetDocValues. Since messages with the same template will have arguments at the same location, and indices are sorted by template_id, this field compresses very well.
1 parent fe6ded2 commit 3b25b97

File tree

12 files changed

+352
-145
lines changed

12 files changed

+352
-145
lines changed
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
package org.elasticsearch.xpack.logsdb.patternedtext;
9+
10+
import org.apache.lucene.store.ByteArrayDataInput;
11+
import org.apache.lucene.store.ByteArrayDataOutput;
12+
import org.apache.lucene.store.DataInput;
13+
14+
import java.io.IOException;
15+
import java.util.ArrayList;
16+
import java.util.Arrays;
17+
import java.util.Base64;
18+
import java.util.List;
19+
20+
/**
21+
* Describes the type and location of an argument in the template. A list of argument infos is encoded and stored in a doc value
22+
* column, this is used to re-combine the template and argument columns. Documents with identical templates share the same
23+
* of argument infos, and since indices are sorted by template_id, this doc value column compresses very well.
24+
*/
25+
public class Arg {
26+
27+
private static final String SPACE = " ";
28+
private static final Base64.Decoder DECODER = Base64.getUrlDecoder();
29+
private static final Base64.Encoder ENCODER = Base64.getUrlEncoder().withoutPadding();
30+
private static int VINT_MAX_BYTES = 5;
31+
32+
public enum Type {
33+
GENERIC(0);
34+
35+
private final int code;
36+
private static final Type[] lookup = new Type[values().length];
37+
static {
38+
for (var type : values()) {
39+
lookup[type.code] = type;
40+
}
41+
}
42+
43+
Type(int code) {
44+
this.code = code;
45+
}
46+
47+
public int toCode() {
48+
return code;
49+
}
50+
51+
public static Type fromCode(int code) {
52+
return lookup[code];
53+
}
54+
}
55+
56+
record Info(Type type, int offsetInTemplate) {
57+
public Info {
58+
assert offsetInTemplate >= 0;
59+
}
60+
61+
void writeTo(ByteArrayDataOutput out, int previousOffset) throws IOException {
62+
out.writeVInt(type.toCode());
63+
int diff = offsetInTemplate - previousOffset;
64+
out.writeVInt(diff);
65+
}
66+
67+
static Info readFrom(DataInput in, int previousOffset) throws IOException {
68+
var type = Type.fromCode(in.readVInt());
69+
int diffFromPrevious = in.readVInt();
70+
int offsetInfoTemplate = previousOffset + diffFromPrevious;
71+
return new Info(type, offsetInfoTemplate);
72+
}
73+
}
74+
75+
static boolean isArg(String text) {
76+
for (int i = 0; i < text.length(); i++) {
77+
if (Character.isDigit(text.charAt(i))) {
78+
return true;
79+
}
80+
}
81+
return false;
82+
}
83+
84+
static String encodeInfo(List<Info> arguments) throws IOException {
85+
int maxSize = VINT_MAX_BYTES + arguments.size() * (VINT_MAX_BYTES + VINT_MAX_BYTES);
86+
byte[] buffer = new byte[maxSize];
87+
var dataInput = new ByteArrayDataOutput(buffer);
88+
dataInput.writeVInt(arguments.size());
89+
int previousOffset = 0;
90+
for (var arg : arguments) {
91+
arg.writeTo(dataInput, previousOffset);
92+
previousOffset = arg.offsetInTemplate;
93+
}
94+
95+
int size = dataInput.getPosition();
96+
byte[] data = Arrays.copyOfRange(buffer, 0, size);
97+
return ENCODER.encodeToString(data);
98+
}
99+
100+
static List<Info> decodeInfo(String encoded) throws IOException {
101+
byte[] encodedBytes = DECODER.decode(encoded);
102+
var input = new ByteArrayDataInput(encodedBytes);
103+
104+
int numArgs = input.readVInt();
105+
int previousOffset = 0;
106+
List<Info> arguments = new ArrayList<>(numArgs);
107+
for (int i = 0; i < numArgs; i++) {
108+
var argInfo = Info.readFrom(input, previousOffset);
109+
arguments.add(argInfo);
110+
previousOffset = argInfo.offsetInTemplate;
111+
}
112+
return arguments;
113+
}
114+
115+
static String encodeRemainingArgs(PatternedTextValueProcessor.Parts parts) {
116+
return String.join(SPACE, parts.args());
117+
}
118+
119+
static String[] decodeRemainingArgs(String mergedArgs) {
120+
return mergedArgs.split(SPACE);
121+
}
122+
}

x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextDocValues.java

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,35 +14,42 @@
1414
import org.apache.lucene.util.BytesRef;
1515

1616
import java.io.IOException;
17+
import java.util.List;
1718

1819
public class PatternedTextDocValues extends BinaryDocValues {
1920
private final SortedSetDocValues templateDocValues;
2021
private final SortedSetDocValues argsDocValues;
22+
private final SortedSetDocValues argsInfoDocValues;
2123

22-
PatternedTextDocValues(SortedSetDocValues templateDocValues, SortedSetDocValues argsDocValues) {
24+
PatternedTextDocValues(SortedSetDocValues templateDocValues, SortedSetDocValues argsDocValues, SortedSetDocValues argsInfoDocValues) {
2325
this.templateDocValues = templateDocValues;
2426
this.argsDocValues = argsDocValues;
27+
this.argsInfoDocValues = argsInfoDocValues;
2528
}
2629

27-
static PatternedTextDocValues from(LeafReader leafReader, String templateFieldName, String argsFieldName) throws IOException {
30+
static PatternedTextDocValues from(LeafReader leafReader, String templateFieldName, String argsFieldName, String argsInfoFieldName)
31+
throws IOException {
2832
SortedSetDocValues templateDocValues = DocValues.getSortedSet(leafReader, templateFieldName);
2933
if (templateDocValues.getValueCount() == 0) {
3034
return null;
3135
}
3236

3337
SortedSetDocValues argsDocValues = DocValues.getSortedSet(leafReader, argsFieldName);
34-
return new PatternedTextDocValues(templateDocValues, argsDocValues);
38+
SortedSetDocValues argsInfoDocValues = DocValues.getSortedSet(leafReader, argsInfoFieldName);
39+
return new PatternedTextDocValues(templateDocValues, argsDocValues, argsInfoDocValues);
3540
}
3641

3742
private String getNextStringValue() throws IOException {
3843
assert templateDocValues.docValueCount() == 1;
3944
String template = templateDocValues.lookupOrd(templateDocValues.nextOrd()).utf8ToString();
40-
int argsCount = PatternedTextValueProcessor.countArgs(template);
41-
if (argsCount > 0) {
45+
List<Arg.Info> argsInfo = Arg.decodeInfo(argsInfoDocValues.lookupOrd(argsInfoDocValues.nextOrd()).utf8ToString());
46+
47+
if (argsInfo.isEmpty() == false) {
4248
assert argsDocValues.docValueCount() == 1;
49+
assert argsInfoDocValues.docValueCount() == 1;
4350
var mergedArgs = argsDocValues.lookupOrd(argsDocValues.nextOrd());
44-
var args = PatternedTextValueProcessor.decodeRemainingArgs(mergedArgs.utf8ToString());
45-
return PatternedTextValueProcessor.merge(new PatternedTextValueProcessor.Parts(template, args));
51+
var args = Arg.decodeRemainingArgs(mergedArgs.utf8ToString());
52+
return PatternedTextValueProcessor.merge(template, args, argsInfo);
4653
} else {
4754
return template;
4855
}
@@ -56,6 +63,7 @@ public BytesRef binaryValue() throws IOException {
5663
/**
 * Positions the args, args info and template iterators on document {@code i}.
 *
 * @param i the document to move to
 * @return whether the template doc values have a value for document {@code i}; the results of positioning the args and
 *         args info iterators are not consulted
 */
@Override
5764
public boolean advanceExact(int i) throws IOException {
5865
argsDocValues.advanceExact(i);
66+
argsInfoDocValues.advanceExact(i);
5967
// If template has a value, then message has a value. We don't have to check args here, since there may not be args for the doc
6068
return templateDocValues.advanceExact(i);
6169
}
@@ -69,20 +77,24 @@ public int docID() {
6977
public int nextDoc() throws IOException {
7078
int templateNext = templateDocValues.nextDoc();
7179
var argsAdvance = argsDocValues.advance(templateNext);
80+
var argsInfoAdvance = argsInfoDocValues.advance(templateNext);
7281
assert argsAdvance >= templateNext;
82+
assert argsInfoAdvance == templateNext;
7383
return templateNext;
7484
}
7585

7686
/**
 * Advances the template iterator to {@code i}, then advances the args and args info iterators to wherever the template
 * iterator landed.
 *
 * @param i the target passed to the template iterator's {@code advance}
 * @return the document id returned by the template iterator
 */
@Override
7787
public int advance(int i) throws IOException {
7888
int templateAdvance = templateDocValues.advance(i);
7989
var argsAdvance = argsDocValues.advance(templateAdvance);
90+
var argsInfoAdvance = argsInfoDocValues.advance(templateAdvance);
8091
// Args may land on a later document, whereas args info is asserted to land exactly on the template's document.
assert argsAdvance >= templateAdvance;
92+
assert argsInfoAdvance == templateAdvance;
8193
return templateAdvance;
8294
}
8395

8496
@Override
8597
public long cost() {
86-
return templateDocValues.cost() + argsDocValues.cost();
98+
return templateDocValues.cost() + argsDocValues.cost() + argsInfoDocValues.cost();
8799
}
88100
}

x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldMapper.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,9 +184,13 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio
184184
// Add template_id doc_values
185185
context.doc().add(templateIdMapper.buildKeywordField(new BytesRef(parts.templateId())));
186186

187+
// Add args Info
188+
String argsInfoEncoded = Arg.encodeInfo(parts.argsInfo());
189+
context.doc().add(new SortedSetDocValuesField(fieldType().argsInfoFieldName(), new BytesRef(argsInfoEncoded)));
190+
187191
// Add args doc_values
188192
if (parts.args().isEmpty() == false) {
189-
String remainingArgs = PatternedTextValueProcessor.encodeRemainingArgs(parts);
193+
String remainingArgs = Arg.encodeRemainingArgs(parts);
190194
context.doc().add(new SortedSetDocValuesField(fieldType().argsFieldName(), new BytesRef(remainingArgs)));
191195
}
192196
}
@@ -207,7 +211,12 @@ protected SyntheticSourceSupport syntheticSourceSupport() {
207211
() -> new CompositeSyntheticFieldLoader(
208212
leafName(),
209213
fullPath(),
210-
new PatternedTextSyntheticFieldLoaderLayer(fieldType().name(), fieldType().templateFieldName(), fieldType().argsFieldName())
214+
new PatternedTextSyntheticFieldLoaderLayer(
215+
fieldType().name(),
216+
fieldType().templateFieldName(),
217+
fieldType().argsFieldName(),
218+
fieldType().argsInfoFieldName()
219+
)
211220
)
212221
);
213222
}

x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextFieldType.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ public class PatternedTextFieldType extends StringFieldType {
5757
private static final String TEMPLATE_SUFFIX = ".template";
5858
private static final String TEMPLATE_ID_SUFFIX = ".template_id";
5959
private static final String ARGS_SUFFIX = ".args";
60+
private static final String ARGS_INFO_SUFFIX = ".args_info";
6061

6162
public static final String CONTENT_TYPE = "patterned_text";
6263

@@ -272,4 +273,8 @@ String argsFieldName() {
272273
return name() + ARGS_SUFFIX;
273274
}
274275

276+
String argsInfoFieldName() {
277+
return name() + ARGS_INFO_SUFFIX;
278+
}
279+
275280
}

x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextIndexFieldData.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ public LeafFieldData loadDirect(LeafReaderContext context) throws IOException {
7575
PatternedTextDocValues docValues = PatternedTextDocValues.from(
7676
leafReader,
7777
fieldType.templateFieldName(),
78-
fieldType.argsFieldName()
78+
fieldType.argsFieldName(),
79+
fieldType.argsInfoFieldName()
7980
);
8081
return new LeafFieldData() {
8182

x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/PatternedTextSyntheticFieldLoaderLayer.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@ class PatternedTextSyntheticFieldLoaderLayer implements CompositeSyntheticFieldL
1919
private final String name;
2020
private final String templateFieldName;
2121
private final String argsFieldName;
22+
private final String argsInfoFieldName;
2223
private PatternedTextSyntheticFieldLoader loader;
2324

24-
PatternedTextSyntheticFieldLoaderLayer(String name, String templateFieldName, String argsFieldName) {
25+
PatternedTextSyntheticFieldLoaderLayer(String name, String templateFieldName, String argsFieldName, String argsInfoFieldName) {
2526
this.name = name;
2627
this.templateFieldName = templateFieldName;
2728
this.argsFieldName = argsFieldName;
29+
this.argsInfoFieldName = argsInfoFieldName;
2830
}
2931

3032
@Override
@@ -34,7 +36,7 @@ public long valueCount() {
3436

3537
@Override
3638
public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
37-
var docValues = PatternedTextDocValues.from(leafReader, templateFieldName, argsFieldName);
39+
var docValues = PatternedTextDocValues.from(leafReader, templateFieldName, argsFieldName, argsInfoFieldName);
3840
if (docValues == null) {
3941
return null;
4042
}

0 commit comments

Comments
 (0)