-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Encode args in a info doc value rather than using placeholders #132782
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
b9bff73
c6f9551
957b2ce
0727314
3969361
90cf48c
874fbf7
e422af9
ff5fae1
f235970
4c6aa6b
ed5bb78
ceeddb4
b0a3c59
68144ea
909adbd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,26 +7,96 @@ | |
|
||
package org.elasticsearch.xpack.logsdb.patternedtext; | ||
|
||
import org.apache.lucene.store.ByteArrayDataInput; | ||
import org.apache.lucene.store.ByteArrayDataOutput; | ||
import org.elasticsearch.common.Strings; | ||
import org.elasticsearch.common.hash.MurmurHash3; | ||
import org.elasticsearch.common.util.ByteUtils; | ||
|
||
import java.io.IOException; | ||
import java.nio.charset.StandardCharsets; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.Base64; | ||
import java.util.List; | ||
|
||
public class PatternedTextValueProcessor { | ||
private static final String TEXT_ARG_PLACEHOLDER = "%W"; | ||
private static final String ARG_PLACEHOLDER = "%"; | ||
private static final String DELIMITER = "[\\s\\[\\]]"; | ||
private static final String SPACE = " "; | ||
|
||
record Parts(String template, String templateId, List<String> args) { | ||
Parts(String template, List<String> args) { | ||
this(template, PatternedTextValueProcessor.templateId(template), args); | ||
record Parts(String template, String templateId, List<String> args, List<ArgSchema> schemas) { | ||
Parts(String template, List<String> args, List<ArgSchema> schemas) { | ||
this(template, PatternedTextValueProcessor.templateId(template), args, schemas); | ||
} | ||
|
||
Parts(String template, List<String> args, String encodedArgsSchema) throws IOException { | ||
this(template, PatternedTextValueProcessor.templateId(template), args, decodeArgumentSchema(encodedArgsSchema)); | ||
} | ||
} | ||
|
||
enum ArgType { | ||
GENERAL(0), | ||
IP4(1), | ||
INTEGER(2); | ||
|
||
private final int code; | ||
private static final ArgType[] lookup = new ArgType[values().length]; | ||
static { | ||
for (ArgType type : values()) { | ||
lookup[type.code] = type; | ||
} | ||
} | ||
|
||
ArgType(int code) { | ||
this.code = code; | ||
} | ||
|
||
public int toCode() { | ||
return code; | ||
} | ||
|
||
public static ArgType fromCode(int code) { | ||
return lookup[code]; | ||
} | ||
} | ||
|
||
record ArgSchema(ArgType type, int offsetInTemplate) { | ||
|
||
} | ||
|
||
static String encodeArgumentSchema(List<ArgSchema> arguments) throws IOException { | ||
int maxSize = Integer.BYTES + arguments.size() * (Integer.BYTES + Integer.BYTES); | ||
byte[] buffer = new byte[maxSize]; | ||
var dataInput = new ByteArrayDataOutput(buffer); | ||
dataInput.writeVInt(arguments.size()); | ||
for (var arg : arguments) { | ||
dataInput.writeVInt(arg.type.toCode()); | ||
dataInput.writeVInt(arg.offsetInTemplate); | ||
|
||
} | ||
|
||
int size = dataInput.getPosition(); | ||
byte[] data = Arrays.copyOfRange(buffer, 0, size); | ||
return Strings.BASE_64_NO_PADDING_URL_ENCODER.encodeToString(data); | ||
} | ||
|
||
private static final Base64.Decoder DECODER = Base64.getUrlDecoder(); | ||
|
||
static List<ArgSchema> decodeArgumentSchema(String encoded) throws IOException { | ||
byte[] encodedBytes = DECODER.decode(encoded); | ||
var input = new ByteArrayDataInput(encodedBytes); | ||
|
||
int numArgs = input.readVInt(); | ||
List<ArgSchema> arguments = new ArrayList<>(numArgs); | ||
|
||
for (int i = 0; i < numArgs; i++) { | ||
var argType = ArgType.fromCode(input.readVInt()); | ||
int offsetInTemplate = input.readVInt(); | ||
arguments.add(new ArgSchema(argType, offsetInTemplate)); | ||
} | ||
return arguments; | ||
} | ||
|
||
static String templateId(String template) { | ||
byte[] bytes = template.getBytes(StandardCharsets.UTF_8); | ||
MurmurHash3.Hash128 hash = new MurmurHash3.Hash128(); | ||
|
@@ -36,33 +106,36 @@ static String templateId(String template) { | |
return Strings.BASE_64_NO_PADDING_URL_ENCODER.encodeToString(hashBytes); | ||
} | ||
|
||
static Parts split(String text) { | ||
static Parts split(String text) throws IOException { | ||
StringBuilder template = new StringBuilder(); | ||
List<String> args = new ArrayList<>(); | ||
List<ArgSchema> schemas = new ArrayList<>(); | ||
String[] tokens = text.split(DELIMITER); | ||
int textIndex = 0; | ||
for (String token : tokens) { | ||
if (token.isEmpty()) { | ||
// add the previous delimiter | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was wondering if we can use That may impact reconstruction performance, though. The logic is very streamlined, currently. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am inclined to reconstruct the original message exactly, at least for the time being. This simplify testing as no changes need to be taken into account when testing for equality between synthetic and stored source. |
||
if (textIndex < text.length() - 1) { | ||
template.append(text.charAt(textIndex++)); | ||
} | ||
continue; | ||
} | ||
if (isArg(token)) { | ||
args.add(token); | ||
template.append(TEXT_ARG_PLACEHOLDER); | ||
} else { | ||
template.append(token); | ||
} | ||
textIndex += token.length(); | ||
if (textIndex < text.length()) { | ||
template.append(text.charAt(textIndex++)); | ||
if (isArg(token)) { | ||
args.add(token); | ||
schemas.add(new ArgSchema(ArgType.GENERAL, template.length())); | ||
template.append(ARG_PLACEHOLDER); | ||
} else { | ||
template.append(token); | ||
} | ||
textIndex += token.length(); | ||
if (textIndex < text.length()) { | ||
template.append(text.charAt(textIndex++)); | ||
} | ||
} | ||
} | ||
while (textIndex < text.length()) { | ||
template.append(text.charAt(textIndex++)); | ||
} | ||
return new Parts(template.toString(), args); | ||
return new Parts(template.toString(), args, schemas); | ||
} | ||
|
||
private static boolean isArg(String text) { | ||
|
@@ -76,25 +149,20 @@ private static boolean isArg(String text) { | |
|
||
static String merge(Parts parts) { | ||
StringBuilder builder = new StringBuilder(); | ||
String[] templateParts = parts.template.split(DELIMITER); | ||
int i = 0; | ||
int templateIndex = 0; | ||
for (String part : templateParts) { | ||
if (part.equals(TEXT_ARG_PLACEHOLDER)) { | ||
builder.append(parts.args.get(i++)); | ||
templateIndex += TEXT_ARG_PLACEHOLDER.length(); | ||
} else if (part.isEmpty() == false) { | ||
builder.append(part); | ||
templateIndex += part.length(); | ||
} | ||
if (templateIndex < parts.template.length()) { | ||
builder.append(parts.template.charAt(templateIndex++)); | ||
} | ||
int numArgs = parts.args.size(); | ||
|
||
int lastWritten = 0; | ||
for (int i = 0; i < numArgs; i++) { | ||
String arg = parts.args.get(i); | ||
ArgSchema argSchema = parts.schemas.get(i); | ||
|
||
builder.append(parts.template, lastWritten, argSchema.offsetInTemplate); | ||
builder.append(arg); | ||
lastWritten = argSchema.offsetInTemplate + ARG_PLACEHOLDER.length(); | ||
} | ||
assert i == parts.args.size() : "expected " + i + " but got " + parts.args.size(); | ||
assert builder.toString().contains(TEXT_ARG_PLACEHOLDER) == false : builder.toString(); | ||
while (templateIndex < parts.template.length()) { | ||
builder.append(parts.template.charAt(templateIndex++)); | ||
|
||
if (lastWritten < parts.template.length()) { | ||
builder.append(parts.template.substring(lastWritten)); | ||
} | ||
return builder.toString(); | ||
} | ||
|
@@ -106,18 +174,4 @@ static String encodeRemainingArgs(Parts parts) { | |
static List<String> decodeRemainingArgs(String mergedArgs) { | ||
return Arrays.asList(mergedArgs.split(SPACE)); | ||
} | ||
|
||
static int countArgs(String template) { | ||
int count = 0; | ||
for (int i = 0; i < template.length() - 1; i++) { | ||
if (template.charAt(i) == '%') { | ||
char next = template.charAt(i + 1); | ||
if (next == 'W') { | ||
count++; | ||
i++; | ||
} | ||
} | ||
} | ||
return count; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think schema field name can be stored using
SortedDocValuesField
? Given thatencodeSchema (...)
stores the schemas as one value so store only one value per document?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ideally, they should be able to be stored as regular SortedDocValues. This is true for all the doc values columns in the patterned_text type. But I ran into an issue where a mapper test class defined in Lucene did not handle SortedDocValues correctly. I submitted this fix: apache/lucene#14839, and it has been merged. If we're using 10.3, I could go ahead and update all doc values in this type to SortedDocValues. But I'm inclined to do it in a separate PR
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cool, and thanks for fixing this in Lucene 🚀