Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,35 +14,44 @@
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.util.List;

public class PatternedTextDocValues extends BinaryDocValues {
private final SortedSetDocValues templateDocValues;
private final SortedSetDocValues argsDocValues;
private final SortedSetDocValues argsSchemaDocValues;

PatternedTextDocValues(SortedSetDocValues templateDocValues, SortedSetDocValues argsDocValues) {
PatternedTextDocValues(SortedSetDocValues templateDocValues, SortedSetDocValues argsDocValues, SortedSetDocValues argsSchemaDocValues) {
this.templateDocValues = templateDocValues;
this.argsDocValues = argsDocValues;
this.argsSchemaDocValues = argsSchemaDocValues;
}

static PatternedTextDocValues from(LeafReader leafReader, String templateFieldName, String argsFieldName) throws IOException {
static PatternedTextDocValues from(LeafReader leafReader, String templateFieldName, String argsFieldName, String argsSchemaFieldName)
throws IOException {
SortedSetDocValues templateDocValues = DocValues.getSortedSet(leafReader, templateFieldName);
if (templateDocValues.getValueCount() == 0) {
return null;
}

SortedSetDocValues argsDocValues = DocValues.getSortedSet(leafReader, argsFieldName);
return new PatternedTextDocValues(templateDocValues, argsDocValues);
SortedSetDocValues argsSchemaDocValues = DocValues.getSortedSet(leafReader, argsSchemaFieldName);
return new PatternedTextDocValues(templateDocValues, argsDocValues, argsSchemaDocValues);
}

private String getNextStringValue() throws IOException {
assert templateDocValues.docValueCount() == 1;
String template = templateDocValues.lookupOrd(templateDocValues.nextOrd()).utf8ToString();
int argsCount = PatternedTextValueProcessor.countArgs(template);
if (argsCount > 0) {
List<PatternedTextValueProcessor.ArgSchema> argsSchema = PatternedTextValueProcessor.decodeArgumentSchema(
argsSchemaDocValues.lookupOrd(argsSchemaDocValues.nextOrd()).utf8ToString()
);

if (argsSchema.isEmpty() == false) {
assert argsDocValues.docValueCount() == 1;
assert argsSchemaDocValues.docValueCount() == 1;
var mergedArgs = argsDocValues.lookupOrd(argsDocValues.nextOrd());
var args = PatternedTextValueProcessor.decodeRemainingArgs(mergedArgs.utf8ToString());
return PatternedTextValueProcessor.merge(new PatternedTextValueProcessor.Parts(template, args));
return PatternedTextValueProcessor.merge(new PatternedTextValueProcessor.Parts(template, args, argsSchema));
} else {
return template;
}
Expand All @@ -56,6 +65,7 @@ public BytesRef binaryValue() throws IOException {
@Override
public boolean advanceExact(int i) throws IOException {
argsDocValues.advanceExact(i);
argsSchemaDocValues.advanceExact(i);
// If template has a value, then message has a value. We don't have to check args here, since there may not be args for the doc
return templateDocValues.advanceExact(i);
}
Expand All @@ -69,20 +79,24 @@ public int docID() {
public int nextDoc() throws IOException {
int templateNext = templateDocValues.nextDoc();
var argsAdvance = argsDocValues.advance(templateNext);
var argsSchemaAdvance = argsSchemaDocValues.advance(templateNext);
assert argsAdvance >= templateNext;
assert argsSchemaAdvance == templateNext;
return templateNext;
}

@Override
public int advance(int i) throws IOException {
int templateAdvance = templateDocValues.advance(i);
var argsAdvance = argsDocValues.advance(templateAdvance);
var argsSchemaAdvance = argsSchemaDocValues.advance(templateAdvance);
assert argsAdvance >= templateAdvance;
assert argsSchemaAdvance == templateAdvance;
return templateAdvance;
}

@Override
public long cost() {
return templateDocValues.cost() + argsDocValues.cost();
return templateDocValues.cost() + argsDocValues.cost() + argsSchemaDocValues.cost();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ protected void parseCreateField(DocumentParserContext context) throws IOExceptio
// Add template_id doc_values
context.doc().add(templateIdMapper.buildKeywordField(new BytesRef(parts.templateId())));

// Add args schema
String argsSchemaEncoded = PatternedTextValueProcessor.encodeArgumentSchema(parts.schemas());
context.doc().add(new SortedSetDocValuesField(fieldType().argsSchemaFieldName(), new BytesRef(argsSchemaEncoded)));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think schema field name can be stored using SortedDocValuesField? Given that encodeSchema (...) stores the schemas as one value so store only one value per document?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally, they should be able to be stored as regular SortedDocValues. This is true for all the doc values columns in the patterned_text type. But I ran into an issue where a mapper test class defined in Lucene did not handle SortedDocValues correctly. I submitted this fix: apache/lucene#14839, and it has been merged. If we're using 10.3, I could go ahead and update all doc values in this type to SortedDocValues. But I'm inclined to do it in a separate PR

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool, and thanks for fixing this in Lucene 🚀


// Add args doc_values
if (parts.args().isEmpty() == false) {
String remainingArgs = PatternedTextValueProcessor.encodeRemainingArgs(parts);
Expand All @@ -207,7 +211,12 @@ protected SyntheticSourceSupport syntheticSourceSupport() {
() -> new CompositeSyntheticFieldLoader(
leafName(),
fullPath(),
new PatternedTextSyntheticFieldLoaderLayer(fieldType().name(), fieldType().templateFieldName(), fieldType().argsFieldName())
new PatternedTextSyntheticFieldLoaderLayer(
fieldType().name(),
fieldType().templateFieldName(),
fieldType().argsFieldName(),
fieldType().argsSchemaFieldName()
)
)
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ public class PatternedTextFieldType extends StringFieldType {
private static final String TEMPLATE_SUFFIX = ".template";
private static final String TEMPLATE_ID_SUFFIX = ".template_id";
private static final String ARGS_SUFFIX = ".args";
private static final String ARGS_SCHEMA_SUFFIX = ".args_schema";

public static final String CONTENT_TYPE = "patterned_text";

Expand Down Expand Up @@ -272,4 +273,8 @@ String argsFieldName() {
return name() + ARGS_SUFFIX;
}

String argsSchemaFieldName() {
return name() + ARGS_SCHEMA_SUFFIX;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ public LeafFieldData loadDirect(LeafReaderContext context) throws IOException {
PatternedTextDocValues docValues = PatternedTextDocValues.from(
leafReader,
fieldType.templateFieldName(),
fieldType.argsFieldName()
fieldType.argsFieldName(),
fieldType.argsSchemaFieldName()
);
return new LeafFieldData() {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ class PatternedTextSyntheticFieldLoaderLayer implements CompositeSyntheticFieldL
private final String name;
private final String templateFieldName;
private final String argsFieldName;
private final String argsSchemaFieldName;
private PatternedTextSyntheticFieldLoader loader;

PatternedTextSyntheticFieldLoaderLayer(String name, String templateFieldName, String argsFieldName) {
PatternedTextSyntheticFieldLoaderLayer(String name, String templateFieldName, String argsFieldName, String argsSchemaFieldName) {
this.name = name;
this.templateFieldName = templateFieldName;
this.argsFieldName = argsFieldName;
this.argsSchemaFieldName = argsSchemaFieldName;
}

@Override
Expand All @@ -34,7 +36,7 @@ public long valueCount() {

@Override
public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
var docValues = PatternedTextDocValues.from(leafReader, templateFieldName, argsFieldName);
var docValues = PatternedTextDocValues.from(leafReader, templateFieldName, argsFieldName, argsSchemaFieldName);
if (docValues == null) {
return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,26 +7,96 @@

package org.elasticsearch.xpack.logsdb.patternedtext;

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.hash.MurmurHash3;
import org.elasticsearch.common.util.ByteUtils;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.List;

public class PatternedTextValueProcessor {
private static final String TEXT_ARG_PLACEHOLDER = "%W";
private static final String ARG_PLACEHOLDER = "%";
private static final String DELIMITER = "[\\s\\[\\]]";
private static final String SPACE = " ";

record Parts(String template, String templateId, List<String> args) {
Parts(String template, List<String> args) {
this(template, PatternedTextValueProcessor.templateId(template), args);
record Parts(String template, String templateId, List<String> args, List<ArgSchema> schemas) {
Parts(String template, List<String> args, List<ArgSchema> schemas) {
this(template, PatternedTextValueProcessor.templateId(template), args, schemas);
}

Parts(String template, List<String> args, String encodedArgsSchema) throws IOException {
this(template, PatternedTextValueProcessor.templateId(template), args, decodeArgumentSchema(encodedArgsSchema));
}
}

enum ArgType {
GENERAL(0),
IP4(1),
INTEGER(2);

private final int code;
private static final ArgType[] lookup = new ArgType[values().length];
static {
for (ArgType type : values()) {
lookup[type.code] = type;
}
}

ArgType(int code) {
this.code = code;
}

public int toCode() {
return code;
}

public static ArgType fromCode(int code) {
return lookup[code];
}
}

record ArgSchema(ArgType type, int offsetInTemplate) {

}

static String encodeArgumentSchema(List<ArgSchema> arguments) throws IOException {
int maxSize = Integer.BYTES + arguments.size() * (Integer.BYTES + Integer.BYTES);
byte[] buffer = new byte[maxSize];
var dataInput = new ByteArrayDataOutput(buffer);
dataInput.writeVInt(arguments.size());
for (var arg : arguments) {
dataInput.writeVInt(arg.type.toCode());
dataInput.writeVInt(arg.offsetInTemplate);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need the offset? If we keep the generic placeholder in the template, we can get the offsets from the latter?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did test adding the placeholder to the template, then scanning the template before message reconstruction to find offsets. This was quite fast in a micro-benchmark, only about 2x slower that the stored offsets in this PR. But this method does not deal with the other issue which this PR addresses: handling messages which contain the placeholder value. There are certainly ways we could deal with this, but those seems complicated.

}

int size = dataInput.getPosition();
byte[] data = Arrays.copyOfRange(buffer, 0, size);
return Strings.BASE_64_NO_PADDING_URL_ENCODER.encodeToString(data);
}

private static final Base64.Decoder DECODER = Base64.getUrlDecoder();

static List<ArgSchema> decodeArgumentSchema(String encoded) throws IOException {
byte[] encodedBytes = DECODER.decode(encoded);
var input = new ByteArrayDataInput(encodedBytes);

int numArgs = input.readVInt();
List<ArgSchema> arguments = new ArrayList<>(numArgs);

for (int i = 0; i < numArgs; i++) {
var argType = ArgType.fromCode(input.readVInt());
int offsetInTemplate = input.readVInt();
arguments.add(new ArgSchema(argType, offsetInTemplate));
}
return arguments;
}

static String templateId(String template) {
byte[] bytes = template.getBytes(StandardCharsets.UTF_8);
MurmurHash3.Hash128 hash = new MurmurHash3.Hash128();
Expand All @@ -36,33 +106,36 @@ static String templateId(String template) {
return Strings.BASE_64_NO_PADDING_URL_ENCODER.encodeToString(hashBytes);
}

static Parts split(String text) {
static Parts split(String text) throws IOException {
StringBuilder template = new StringBuilder();
List<String> args = new ArrayList<>();
List<ArgSchema> schemas = new ArrayList<>();
String[] tokens = text.split(DELIMITER);
int textIndex = 0;
for (String token : tokens) {
if (token.isEmpty()) {
// add the previous delimiter
Copy link
Contributor

@kkrik-es kkrik-es Aug 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was wondering if we can use (double space) to track the presence of an arg, and replace all other whitespace with (single space) in the template. That will help avoid tracking offsets in arg schema and further compress the template, reducing the storage footprint. The reconstructed msg will look slightly different, but afaict whitespaces are barely used in term and phrase queries.

That may impact reconstruction performance, though. The logic is very streamlined, currently.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am inclined to reconstruct the original message exactly, at least for the time being. This simplify testing as no changes need to be taken into account when testing for equality between synthetic and stored source.

if (textIndex < text.length() - 1) {
template.append(text.charAt(textIndex++));
}
continue;
}
if (isArg(token)) {
args.add(token);
template.append(TEXT_ARG_PLACEHOLDER);
} else {
template.append(token);
}
textIndex += token.length();
if (textIndex < text.length()) {
template.append(text.charAt(textIndex++));
if (isArg(token)) {
args.add(token);
schemas.add(new ArgSchema(ArgType.GENERAL, template.length()));
template.append(ARG_PLACEHOLDER);
} else {
template.append(token);
}
textIndex += token.length();
if (textIndex < text.length()) {
template.append(text.charAt(textIndex++));
}
}
}
while (textIndex < text.length()) {
template.append(text.charAt(textIndex++));
}
return new Parts(template.toString(), args);
return new Parts(template.toString(), args, schemas);
}

private static boolean isArg(String text) {
Expand All @@ -76,25 +149,20 @@ private static boolean isArg(String text) {

static String merge(Parts parts) {
StringBuilder builder = new StringBuilder();
String[] templateParts = parts.template.split(DELIMITER);
int i = 0;
int templateIndex = 0;
for (String part : templateParts) {
if (part.equals(TEXT_ARG_PLACEHOLDER)) {
builder.append(parts.args.get(i++));
templateIndex += TEXT_ARG_PLACEHOLDER.length();
} else if (part.isEmpty() == false) {
builder.append(part);
templateIndex += part.length();
}
if (templateIndex < parts.template.length()) {
builder.append(parts.template.charAt(templateIndex++));
}
int numArgs = parts.args.size();

int lastWritten = 0;
for (int i = 0; i < numArgs; i++) {
String arg = parts.args.get(i);
ArgSchema argSchema = parts.schemas.get(i);

builder.append(parts.template, lastWritten, argSchema.offsetInTemplate);
builder.append(arg);
lastWritten = argSchema.offsetInTemplate + ARG_PLACEHOLDER.length();
}
assert i == parts.args.size() : "expected " + i + " but got " + parts.args.size();
assert builder.toString().contains(TEXT_ARG_PLACEHOLDER) == false : builder.toString();
while (templateIndex < parts.template.length()) {
builder.append(parts.template.charAt(templateIndex++));

if (lastWritten < parts.template.length()) {
builder.append(parts.template.substring(lastWritten));
}
return builder.toString();
}
Expand All @@ -106,18 +174,4 @@ static String encodeRemainingArgs(Parts parts) {
static List<String> decodeRemainingArgs(String mergedArgs) {
return Arrays.asList(mergedArgs.split(SPACE));
}

static int countArgs(String template) {
int count = 0;
for (int i = 0; i < template.length() - 1; i++) {
if (template.charAt(i) == '%') {
char next = template.charAt(i + 1);
if (next == 'W') {
count++;
i++;
}
}
}
return count;
}
}
Loading