Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
01c4420
Initial integration of the char parser
eyalkoren Aug 14, 2025
6ce693c
Update docs/changelog/132921.yaml
eyalkoren Aug 14, 2025
9bc6233
Using modern switch instanceof in SubTokenEvaluatorFactory
eyalkoren Aug 14, 2025
87a1501
Removing copy of ByteUtils class
eyalkoren Aug 14, 2025
96d920a
Adding basic microbenchmark
eyalkoren Aug 17, 2025
99e74d0
Extracting api package and add javadoc
eyalkoren Aug 17, 2025
2f73866
Adding data loss detection mechanism
eyalkoren Aug 18, 2025
6f21cd5
Adding ParseException
eyalkoren Aug 18, 2025
a4f8d3e
[CI] Auto commit changes from spotless
Aug 18, 2025
0a018c5
First phase handling trimmed characters
eyalkoren Aug 21, 2025
02ab8b0
Merge remote-tracking branch 'eyalkoren/introducing-charparser' into …
eyalkoren Aug 21, 2025
532ecde
Cleaner Parser.parse() interface
eyalkoren Aug 24, 2025
c045e71
Merge remote-tracking branch 'upstream/main' into introducing-charparser
eyalkoren Aug 24, 2025
ea3509c
Some javadoc and minor refactor
eyalkoren Aug 25, 2025
68855bf
Merge remote-tracking branch 'upstream/main' into introducing-charparser
eyalkoren Sep 2, 2025
ff38c24
Merge remote-tracking branch 'upstream/main' into introducing-charparser
eyalkoren Oct 30, 2025
46a871c
Fixing improper string sub-token bitmask evaluation
eyalkoren Nov 5, 2025
633a4fe
Merge remote-tracking branch 'upstream/main' into introducing-charparser
eyalkoren Nov 5, 2025
0f6c917
Fixing tests and renaming
eyalkoren Nov 6, 2025
207fc1e
Refactor trimmed characters to token boundary characters
eyalkoren Nov 6, 2025
163af88
Merge remote-tracking branch 'upstream/main' into introducing-charparser
eyalkoren Nov 10, 2025
d28d296
Add support for boundary character + prefixes/suffixes + enhance algo…
eyalkoren Nov 12, 2025
878c430
[CI] Auto commit changes from spotless
Nov 12, 2025
3d5ff38
Support signed integers
eyalkoren Nov 12, 2025
5ee2edf
Merge remote-tracking branch 'eyalkoren/introducing-charparser' into …
eyalkoren Nov 12, 2025
23eaebf
Major algorithm enhancement around buffering
eyalkoren Nov 13, 2025
90a0ed2
[CI] Auto commit changes from spotless
Nov 13, 2025
65eb805
Fix small bug
eyalkoren Nov 16, 2025
b30f482
Add support for floating point numbers
eyalkoren Nov 18, 2025
76672aa
Merge remote-tracking branch 'upstream/main' into introducing-charparser
eyalkoren Nov 18, 2025
92e025a
Adding PatternTextValueProcessor to benchmark
eyalkoren Nov 18, 2025
07333f3
Add support to some more timestamp formats
eyalkoren Nov 18, 2025
f4a8b19
Optimizing BitmaskRegistry#getHigherLevelBitmaskByPosition
eyalkoren Nov 18, 2025
4dacd20
Disabling timestamp parsing on all benchmarks
eyalkoren Nov 18, 2025
974713d
[CI] Auto commit changes from spotless
Nov 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ dependencies {
api(project(':x-pack:plugin:esql'))
api(project(':x-pack:plugin:esql:compute'))
api(project(':x-pack:plugin:mapper-exponential-histogram'))
api(project(':x-pack:plugin:logsdb'))
implementation project(path: ':libs:native')
implementation project(path: ':libs:simdvec')
implementation project(path: ':libs:exponential-histogram')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.benchmark.index.mapper;

import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Argument;
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.IPv4Argument;
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.IntegerArgument;
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.ParseException;
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Parser;
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.ParserFactory;
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Timestamp;
import org.elasticsearch.xpack.logsdb.patterntext.PatternTextValueProcessor;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * JMH benchmark comparing three ways of processing the same log message: the char parser created by
 * {@link ParserFactory}, a regex-based reference implementation ({@link RegexParser}) and
 * {@link PatternTextValueProcessor#split(String)}.
 * <p>
 * Run using the following command: ./gradlew -p benchmarks run --args 'PatternedTextParserBenchmark'
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 3, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(1)
public class PatternedTextParserBenchmark {

    private Parser parser;
    private RegexParser regexParser;
    private String testMessage;
    @SuppressWarnings("FieldCanBeLocal") // used for measurement of timestamp parsing overhead
    private DateTimeFormatter dateTimeFormatter;

    /**
     * Creates the parsers under test and the single message that every benchmark method processes.
     */
    @Setup
    public void setup() {
        parser = ParserFactory.createParser();
        regexParser = new RegexParser();
        testMessage = "Oct 05, 2023 02:48:00 PM INFO Response from 127.0.0.1 took 2000 ms";
        dateTimeFormatter = DateTimeFormatter.ofPattern("MMM dd, yyyy hh:mm:ss a").withLocale(Locale.US);
    }

    /**
     * Measures the parser returned by {@link ParserFactory#createParser()}.
     */
    @Benchmark
    public void parseWithCharParser(Blackhole blackhole) throws ParseException {
        List<Argument<?>> arguments = parser.parse(testMessage);
        blackhole.consume(arguments);
    }

    /**
     * Measures the regex-based reference implementation.
     */
    @Benchmark
    public void parseWithRegexParser(Blackhole blackhole) throws ParseException {
        List<Argument<?>> arguments = regexParser.parse(testMessage);
        blackhole.consume(arguments);
    }

    /**
     * Measures {@link PatternTextValueProcessor#split(String)}. Timestamp parsing is intentionally disabled
     * (see the commented-out lines) so that it is excluded from the measurement.
     */
    @Benchmark
    public void parseWithSimpleParser(Blackhole blackhole) throws ParseException {
        PatternTextValueProcessor.Parts parts = PatternTextValueProcessor.split(testMessage);
        blackhole.consume(parts);
        // long timestamp = TimestampFormat.parseTimestamp(dateTimeFormatter, "Oct 05, 2023 02:48:00 PM");
        // blackhole.consume(timestamp);
    }

    /**
     * Regex-based {@link Parser} used as a comparison baseline. It extracts at most one timestamp, then IPv4
     * addresses and finally integers from the text that follows the timestamp, skipping any candidate whose
     * text range overlaps an argument that was already extracted.
     */
    private static class RegexParser implements Parser {

        private static final Pattern IPV4_PATTERN = Pattern.compile("\\b(\\d{1,3}(?:\\.\\d{1,3}){3})\\b");
        private static final Pattern INTEGER_PATTERN = Pattern.compile("\\b\\d+\\b");

        // Timestamp layout #1, e.g. "05/Oct/2023:14:48:00 +0000"
        private static final Pattern TIMESTAMP_1_PATTERN = Pattern.compile(
            "\\b\\d{2}/[A-Za-z]{3}/\\d{4}:\\d{2}:\\d{2}:\\d{2} [+-]\\d{4}\\b"
        );
        private static final String TIMESTAMP_1_FORMAT = "dd/MMM/yyyy:HH:mm:ss Z";
        private static final DateTimeFormatter TIMESTAMP_1_FORMATTER = DateTimeFormatter.ofPattern(TIMESTAMP_1_FORMAT, Locale.ENGLISH);

        // Timestamp layout #2, e.g. "Oct 05, 2023 02:48:00 PM"
        private static final Pattern TIMESTAMP_2_PATTERN = Pattern.compile(
            "\\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \\d{2}, \\d{4} \\d{2}:\\d{2}:\\d{2} (?:AM|PM)\\b"
        );
        private static final String TIMESTAMP_2_FORMAT = "MMM dd, yyyy hh:mm:ss a";
        private static final DateTimeFormatter TIMESTAMP_2_FORMATTER = DateTimeFormatter.ofPattern(TIMESTAMP_2_FORMAT, Locale.ENGLISH);

        /**
         * Checks if a position range overlaps with any existing argument in the list.
         * <p>
         * Ranges are treated as half-open: {@code [start, start + length)}. Two ranges that merely touch
         * (one ends exactly where the other starts) share no character and therefore do not overlap.
         *
         * @param arguments List of existing arguments
         * @param startPos Start position (inclusive) of the range to check
         * @param length Length of the range to check
         * @return true if there is an overlap, false otherwise
         */
        private static boolean isOverlappingWithExistingArguments(List<Argument<?>> arguments, int startPos, int length) {
            int endPos = startPos + length;
            for (Argument<?> arg : arguments) {
                int argStart = arg.startPosition();
                int argEnd = argStart + arg.length();

                // Strict comparisons: the end positions are exclusive
                if (startPos < argEnd && endPos > argStart) {
                    return true;
                }
            }
            return false;
        }

        /**
         * Extracts a timestamp (if any), IPv4 addresses and integers from the given message.
         * All reported start positions are relative to the beginning of {@code rawMessage}.
         *
         * @param rawMessage the message to parse
         * @return the extracted arguments: the timestamp first (if found), then IPv4 addresses, then integers
         * @throws IllegalArgumentException if {@code rawMessage} is null or empty
         * @throws ParseException if the timestamp argument cannot be created or an integer token does not fit an {@code int}
         */
        @Override
        public List<Argument<?>> parse(String rawMessage) throws ParseException {
            if (rawMessage == null || rawMessage.isEmpty()) {
                throw new IllegalArgumentException("rawMessage cannot be null or empty");
            }

            List<Argument<?>> arguments = new ArrayList<>();

            // 1. Find and extract timestamp substring (prefer TIMESTAMP_1, then TIMESTAMP_2)
            int tsStart = -1;
            int tsEnd = -1;
            String tsString = null;
            // only consumed by the currently disabled timestamp parsing below
            DateTimeFormatter usedFormatter = null;

            Matcher ts1Matcher = TIMESTAMP_1_PATTERN.matcher(rawMessage);
            if (ts1Matcher.find()) {
                tsString = ts1Matcher.group();
                tsStart = ts1Matcher.start();
                tsEnd = ts1Matcher.end();
                usedFormatter = TIMESTAMP_1_FORMATTER;
            } else {
                Matcher ts2Matcher = TIMESTAMP_2_PATTERN.matcher(rawMessage);
                if (ts2Matcher.find()) {
                    tsString = ts2Matcher.group();
                    tsStart = ts2Matcher.start();
                    tsEnd = ts2Matcher.end();
                    usedFormatter = TIMESTAMP_2_FORMATTER;
                }
            }

            if (tsString != null) {
                try {
                    // long timestampMillis = TimestampFormat.parseTimestamp(usedFormatter, tsString);
                    // arguments.add(new Timestamp(tsStart, tsEnd - tsStart, timestampMillis, "doesn't matter"));
                    arguments.add(new Timestamp(tsStart, tsEnd - tsStart, 1L, "doesn't matter"));
                } catch (Exception e) {
                    throw new ParseException("Failed to parse timestamp: " + tsString, e);
                }
            }

            // 2. Process the rest of the message for IP addresses and integers.
            // When no timestamp was found tsEnd is -1; the scan then starts at 0 and the offset added to matcher
            // positions must be 0 as well, otherwise every reported position would be off by one.
            final int offset = Math.max(tsEnd, 0);
            String remaining = rawMessage.substring(offset);

            // Find IP addresses
            Matcher ipMatcher = IPV4_PATTERN.matcher(remaining);
            while (ipMatcher.find()) {
                String ipStr = ipMatcher.group();
                int startPos = offset + ipMatcher.start();
                int length = ipMatcher.end() - ipMatcher.start();

                // Only add if not overlapping with existing arguments
                if (isOverlappingWithExistingArguments(arguments, startPos, length) == false) {
                    String[] octets = ipStr.split("\\.");
                    int[] octetValues = new int[4];
                    for (int j = 0; j < 4; j++) {
                        octetValues[j] = Integer.parseInt(octets[j]);
                    }
                    arguments.add(new IPv4Argument(startPos, length, octetValues, 0));
                }
            }

            // Find integers
            Matcher intMatcher = INTEGER_PATTERN.matcher(remaining);
            while (intMatcher.find()) {
                String intStr = intMatcher.group();
                int startPos = offset + intMatcher.start();
                int length = intMatcher.end() - intMatcher.start();

                // Only add if not overlapping with existing arguments
                if (isOverlappingWithExistingArguments(arguments, startPos, length) == false) {
                    final int value;
                    try {
                        value = Integer.parseInt(intStr);
                    } catch (NumberFormatException e) {
                        // \d+ is unbounded, so the digit run may not fit an int; report it through the declared failure type
                        throw new ParseException("Failed to parse integer: " + intStr, e);
                    }
                    arguments.add(new IntegerArgument(startPos, length, value));
                }
            }

            return arguments;
        }
    }
}
5 changes: 5 additions & 0 deletions docs/changelog/132921.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 132921
summary: "WIP: Initial integration of the char parser"
area: Logs
type: feature
issues: []
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api;

import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType;

/**
* Represents a typed argument extracted from a text message.
* <p>
* An argument exposes its value and encoding type, the location (start position and length) of the text it was
* extracted from, and a string encoding of the value.
*
* @param <T> the type of the argument's value
*/
public interface Argument<T> {
/**
* Returns the original value of the argument.
*
* @return the argument's value
*/
T value();

/**
* Returns the encoding type of the argument.
*
* @return the encoding type
*/
EncodingType type();

/**
* Returns the start position (first character) of the text that was used to extract this argument in the original text.
* Together with {@link #length()} this identifies the source text range of the argument.
*
* @return the start position (inclusive)
*/
int startPosition();

/**
* Returns the length (number of characters) of the text that was used to extract this argument in the original text.
* The first position after the argument's text is therefore {@code startPosition() + length()}.
*
* @return the length
*/
int length();

/**
* Returns a string representation of the argument's value.
* The exact format is chosen by the implementing class.
*
* @return the string representation of the value
*/
String encode();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api;

import java.util.Base64;

/**
 * An abstract class for arguments that are encoded as a byte array.
 * <p>
 * This class provides a base implementation for arguments that are represented as a byte array.
 * It allocates and stores the byte array, records the location of the source text, and implements
 * {@link #encode()} as unpadded Base64 of the stored bytes.
 */
public abstract class ByteEncodedArgument implements Argument<byte[]> {

    /**
     * Encoder shared by all instances. {@link Base64.Encoder} is documented as safe for use by multiple
     * concurrent threads, so there is no need to hold a separate encoder per argument.
     */
    protected static final Base64.Encoder encoder = Base64.getEncoder().withoutPadding();

    protected final int textStartPosition;
    protected final int textLength;

    /** Backing storage for the encoded value; created zero-filled by the constructor. */
    protected final byte[] encodedBytes;

    /**
     * @param textStartPosition start position of the source text of this argument
     * @param textLength        length of the source text of this argument
     * @param numBytes          size of the byte array to allocate for the encoded value
     */
    protected ByteEncodedArgument(int textStartPosition, int textLength, int numBytes) {
        this.textStartPosition = textStartPosition;
        this.textLength = textLength;
        this.encodedBytes = new byte[numBytes];
    }

    /**
     * Returns the internal byte array itself (not a copy).
     *
     * @return the bytes backing this argument
     */
    @Override
    public byte[] value() {
        return encodedBytes;
    }

    @Override
    public int startPosition() {
        return textStartPosition;
    }

    @Override
    public int length() {
        return textLength;
    }

    /**
     * @return the stored bytes encoded as Base64 without padding
     */
    @Override
    public String encode() {
        return encoder.encodeToString(encodedBytes);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api;

import org.elasticsearch.common.util.ByteUtils;
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType;

import java.util.Base64;

/**
 * Represents a double argument extracted from a text message.
 * <p>
 * {@link #encode()} produces the unpadded Base64 form of the bytes written by
 * {@code ByteUtils.writeDoubleLE} for the value.
 */
public final class DoubleArgument implements Argument<Double> {

    /**
     * Encoder shared by all instances. {@link Base64.Encoder} is documented as safe for use by multiple
     * concurrent threads, so there is no need to hold a separate encoder per argument.
     */
    private static final Base64.Encoder ENCODER = Base64.getEncoder().withoutPadding();

    private final int startPosition;
    private final int length;
    private final double value;

    // scratch buffer for encoding; rewritten with the same content on every encode() call
    private final byte[] doubleBytes = new byte[Double.BYTES];

    /**
     * Creates an argument by parsing the given range of the given text.
     *
     * @param s             the text containing the number
     * @param startPosition index in {@code s} of the first character of the number
     * @param length        number of characters of the number
     * @throws NumberFormatException if the range does not contain a parsable double
     */
    public DoubleArgument(String s, int startPosition, int length) {
        // todo - consider alternative for Double.parseDouble(String) that can work with CharSequence, then we can use SubstringView
        this(startPosition, length, Double.parseDouble(s.substring(startPosition, startPosition + length)));
    }

    /**
     * Creates an argument from an already parsed value.
     *
     * @param startPosition start position of the source text of this argument
     * @param length        length of the source text of this argument
     * @param value         the parsed value
     */
    public DoubleArgument(int startPosition, int length, double value) {
        this.startPosition = startPosition;
        this.length = length;
        this.value = value;
    }

    /**
     * NOTE: this method is boxing the double value into a Double object.
     * @return the value as a Double object
     */
    @Override
    public Double value() {
        return value;
    }

    @Override
    public EncodingType type() {
        return EncodingType.DOUBLE;
    }

    @Override
    public int startPosition() {
        return startPosition;
    }

    @Override
    public int length() {
        return length;
    }

    /**
     * @return the value's bytes, as written by {@code ByteUtils.writeDoubleLE}, encoded as Base64 without padding
     */
    @Override
    public String encode() {
        ByteUtils.writeDoubleLE(value, doubleBytes, 0);
        return ENCODER.encodeToString(doubleBytes);
    }
}
Loading