diff --git a/benchmarks/build.gradle b/benchmarks/build.gradle index 9439cca133fd9..affd5ee1eaccf 100644 --- a/benchmarks/build.gradle +++ b/benchmarks/build.gradle @@ -48,6 +48,7 @@ dependencies { api(project(':x-pack:plugin:esql')) api(project(':x-pack:plugin:esql:compute')) api(project(':x-pack:plugin:mapper-exponential-histogram')) + api(project(':x-pack:plugin:logsdb')) implementation project(path: ':libs:native') implementation project(path: ':libs:simdvec') implementation project(path: ':libs:exponential-histogram') diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/index/mapper/PatternedTextParserBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/mapper/PatternedTextParserBenchmark.java new file mode 100644 index 0000000000000..e3dcc44ee231f --- /dev/null +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/index/mapper/PatternedTextParserBenchmark.java @@ -0,0 +1,202 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.benchmark.index.mapper; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Argument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.IPv4Argument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.IntegerArgument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.ParseException; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Parser; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.ParserFactory; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Timestamp; +import org.elasticsearch.xpack.logsdb.patterntext.PatternTextValueProcessor; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Run using the following command: ./gradlew -p benchmarks run --args 'PatternedTextParserBenchmark' + */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@State(Scope.Benchmark) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +@Fork(1) +public class PatternedTextParserBenchmark { + + private Parser parser; + private RegexParser regexParser; + private String testMessage; + @SuppressWarnings("FieldCanBeLocal") // used for measurement of timestamp parsing overhead + private DateTimeFormatter dateTimeFormatter; + + @Setup + public void setup() { + parser = ParserFactory.createParser(); + regexParser = new RegexParser(); + testMessage = "Oct 05, 
2023 02:48:00 PM INFO Response from 127.0.0.1 took 2000 ms"; + dateTimeFormatter = DateTimeFormatter.ofPattern("MMM dd, yyyy hh:mm:ss a").withLocale(java.util.Locale.US); + } + + @Benchmark + public void parseWithCharParser(Blackhole blackhole) throws ParseException { + List> arguments = parser.parse(testMessage); + blackhole.consume(arguments); + } + + @Benchmark + public void parseWithRegexParser(Blackhole blackhole) throws ParseException { + List> arguments = regexParser.parse(testMessage); + blackhole.consume(arguments); + } + + @Benchmark + public void parseWithSimpleParser(Blackhole blackhole) throws ParseException { + PatternTextValueProcessor.Parts parts = PatternTextValueProcessor.split(testMessage); + blackhole.consume(parts); + // long timestamp = TimestampFormat.parseTimestamp(dateTimeFormatter, "Oct 05, 2023 02:48:00 PM"); + // blackhole.consume(timestamp); + } + + private static class RegexParser implements Parser { + + private static final Pattern IPV4_PATTERN = Pattern.compile("\\b(\\d{1,3}(?:\\.\\d{1,3}){3})\\b"); + private static final Pattern INTEGER_PATTERN = Pattern.compile("\\b\\d+\\b"); + + // New timestamp pattern and format + private static final Pattern TIMESTAMP_1_PATTERN = Pattern.compile( + "\\b\\d{2}/[A-Za-z]{3}/\\d{4}:\\d{2}:\\d{2}:\\d{2} [+-]\\d{4}\\b" + ); + private static final String TIMESTAMP_1_FORMAT = "dd/MMM/yyyy:HH:mm:ss Z"; + private static final DateTimeFormatter TIMESTAMP_1_FORMATTER = DateTimeFormatter.ofPattern(TIMESTAMP_1_FORMAT, Locale.ENGLISH); + + // Existing timestamp pattern and format + private static final Pattern TIMESTAMP_2_PATTERN = Pattern.compile( + "\\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \\d{2}, \\d{4} \\d{2}:\\d{2}:\\d{2} (?:AM|PM)\\b" + ); + private static final String TIMESTAMP_2_FORMAT = "MMM dd, yyyy hh:mm:ss a"; + private static final DateTimeFormatter TIMESTAMP_2_FORMATTER = DateTimeFormatter.ofPattern(TIMESTAMP_2_FORMAT, Locale.ENGLISH); + + /** + * Checks if a position range overlaps with any existing argument in the list + * @param arguments List of existing arguments + * @param startPos Start position of the range to check + * @param length Length of the range to check + * @return true if there is an overlap, false otherwise + */ + private boolean isOverlappingWithExistingArguments(List> arguments, int startPos, int length) { + int endPos = startPos + length; + for (Argument arg : arguments) { + int argStart = arg.startPosition(); + int argEnd = argStart + arg.length(); + + // Check if ranges overlap + if ((startPos <= argEnd) && (endPos >= argStart)) { + return true; + } + } + return false; + } + + @Override + public List> parse(String rawMessage) throws ParseException { + if (rawMessage == null || rawMessage.isEmpty()) { + throw new IllegalArgumentException("rawMessage cannot be null or empty"); + } + + List> arguments = new ArrayList<>(); + + // 1. 
Find and extract timestamp substring (prefer TIMESTAMP_1, then TIMESTAMP_2) + int tsStart = -1, tsEnd = -1; + String tsString = null; + DateTimeFormatter usedFormatter = null; + + Matcher ts1Matcher = TIMESTAMP_1_PATTERN.matcher(rawMessage); + if (ts1Matcher.find()) { + tsString = ts1Matcher.group(); + tsStart = ts1Matcher.start(); + tsEnd = ts1Matcher.end(); + usedFormatter = TIMESTAMP_1_FORMATTER; + } else { + Matcher ts2Matcher = TIMESTAMP_2_PATTERN.matcher(rawMessage); + if (ts2Matcher.find()) { + tsString = ts2Matcher.group(); + tsStart = ts2Matcher.start(); + tsEnd = ts2Matcher.end(); + usedFormatter = TIMESTAMP_2_FORMATTER; + } + } + + if (tsString != null) { + try { + // long timestampMillis = TimestampFormat.parseTimestamp(usedFormatter, tsString); + // arguments.add(new Timestamp(tsStart, tsEnd - tsStart, timestampMillis, "doesn't matter")); + arguments.add(new Timestamp(tsStart, tsEnd - tsStart, 1L, "doesn't matter")); + } catch (Exception e) { + throw new ParseException("Failed to parse timestamp: " + tsString, e); + } + } + + // 2. Process the rest of the message for IP addresses and integers + String remaining = tsEnd >= 0 ? rawMessage.substring(tsEnd) : rawMessage; + + // Find IP addresses + Matcher ipMatcher = IPV4_PATTERN.matcher(remaining); + while (ipMatcher.find()) { + String ipStr = ipMatcher.group(); + int startPos = tsEnd + ipMatcher.start(); + int length = ipMatcher.end() - ipMatcher.start(); + + // Only add if not overlapping with existing arguments + if (isOverlappingWithExistingArguments(arguments, startPos, length) == false) { + String[] octets = ipStr.split("\\."); + int[] octetValues = new int[4]; + for (int j = 0; j < 4; j++) { + octetValues[j] = Integer.parseInt(octets[j]); + } + arguments.add(new IPv4Argument(startPos, length, octetValues, 0)); + } + } + + // Find integers + Matcher intMatcher = INTEGER_PATTERN.matcher(remaining); + while (intMatcher.find()) { + String intStr = intMatcher.group(); + int startPos = tsEnd + intMatcher.start(); + int length = intMatcher.end() - intMatcher.start(); + + // Only add if not overlapping with existing arguments + if (isOverlappingWithExistingArguments(arguments, startPos, length) == false) { + int value = Integer.parseInt(intStr); + arguments.add(new IntegerArgument(startPos, length, value)); + } + } + + return arguments; + } + } +} diff --git a/docs/changelog/132921.yaml b/docs/changelog/132921.yaml new file mode 100644 index 0000000000000..f2c4c68427a7f --- /dev/null +++ b/docs/changelog/132921.yaml @@ -0,0 +1,5 @@ +pr: 132921 +summary: "WIP: Initial integration of the char parser" +area: Logs +type: feature +issues: [] diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/Argument.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/Argument.java new file mode 100644 index 0000000000000..b4314ca668916 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/Argument.java @@ -0,0 +1,52 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +/** + * Represents a typed argument extracted from a text message. + *
+ * <p>
+ * An argument holds the original value and its encoding type, and can provide a string representation of the value. + * + * @param the type of the argument's value + */ +public interface Argument { + /** + * Returns the original value of the argument. + * + * @return the argument's value + */ + T value(); + + /** + * Returns the encoding type of the argument. + * + * @return the encoding type + */ + EncodingType type(); + + /** + * Returns the start position (first character) of the text that was used to extract this argument in the original text. + * @return the start position (inclusive) + */ + int startPosition(); + + /** + * Returns the length (number of characters) of the text that was used to extract this argument in the original text. + * @return the length + */ + int length(); + + /** + * Returns a string representation of the argument's value. + * + * @return the string representation of the value + */ + String encode(); +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/ByteEncodedArgument.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/ByteEncodedArgument.java new file mode 100644 index 0000000000000..e255736fda2d0 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/ByteEncodedArgument.java @@ -0,0 +1,51 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import java.util.Base64; + +/** + * An abstract class for arguments that are encoded as a byte array. + *
+ * <p>
+ * This class provides a base implementation for arguments that are represented as a byte array. + * It handles the storage of the byte array and provides a Base64 encoder for the `encode()` method. + */ +public abstract class ByteEncodedArgument implements Argument { + + protected final int textStartPosition; + protected final int textLength; + + protected final byte[] encodedBytes; + protected final Base64.Encoder encoder = Base64.getEncoder().withoutPadding(); + + protected ByteEncodedArgument(int textStartPosition, int textLength, int numBytes) { + this.textStartPosition = textStartPosition; + this.textLength = textLength; + this.encodedBytes = new byte[numBytes]; + } + + @Override + public byte[] value() { + return encodedBytes; + } + + @Override + public int startPosition() { + return textStartPosition; + } + + @Override + public int length() { + return textLength; + } + + @Override + public String encode() { + return encoder.encodeToString(encodedBytes); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/DoubleArgument.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/DoubleArgument.java new file mode 100644 index 0000000000000..e191771287a7b --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/DoubleArgument.java @@ -0,0 +1,67 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import org.elasticsearch.common.util.ByteUtils; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +import java.util.Base64; + +/** + * Represents a double argument extracted from a text message. + */ +public final class DoubleArgument implements Argument { + private final int startPosition; + private final int length; + private final double value; + + // for encoding + private final byte[] doubleBytes = new byte[8]; + private final Base64.Encoder encoder = Base64.getEncoder().withoutPadding(); + + public DoubleArgument(String s, int startPosition, int length) { + // todo - consider alternative for Double.parseDouble(String) that can work with CharSequence, the we can use SubstringView + this(startPosition, length, Double.parseDouble(s.substring(startPosition, startPosition + length))); + } + + public DoubleArgument(int startPosition, int length, double value) { + this.startPosition = startPosition; + this.length = length; + this.value = value; + } + + /** + * NOTE: this method is boxing the double value into a Double object. 
+ * @return the value as a Double object + */ + @Override + public Double value() { + return value; + } + + @Override + public EncodingType type() { + return EncodingType.DOUBLE; + } + + @Override + public int startPosition() { + return startPosition; + } + + @Override + public int length() { + return length; + } + + @Override + public String encode() { + ByteUtils.writeDoubleLE(value, doubleBytes, 0); + return encoder.encodeToString(doubleBytes); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/HexadecimalArgument.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/HexadecimalArgument.java new file mode 100644 index 0000000000000..ada1cb12554b8 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/HexadecimalArgument.java @@ -0,0 +1,39 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +/** + * Represents a hexadecimal argument extracted from a text message. + *
+ * <p>
+ * The value is a byte array decoded from a hexadecimal string. + */ +public final class HexadecimalArgument extends ByteEncodedArgument { + + public HexadecimalArgument(String s, int startPosition, int length) { + super(startPosition, length, (length + 1) / 2); + int endIndex = startPosition + length; + for (int i = startPosition, j = 0; i < endIndex; i += 2, j++) { + int high = Character.digit(s.charAt(i), 16); + if (i + 1 < endIndex) { + int low = Character.digit(s.charAt(i + 1), 16); + encodedBytes[j] = (byte) ((high << 4) | low); + } else { + // this is the last nibble for an odd-length string. + // it should be treated as the low nibble of the last byte. + encodedBytes[j] = (byte) high; + } + } + } + + @Override + public EncodingType type() { + return EncodingType.HEX; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/IPv4AddressArgument.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/IPv4AddressArgument.java new file mode 100644 index 0000000000000..e58f11353f838 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/IPv4AddressArgument.java @@ -0,0 +1,46 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +/** + * todo - probably makes sense to encode in two separate byte arrays - one for IP and one for port + * + * Represents an IPv4 address with port argument extracted from a text message. + *
+ * <p>
+ * The value is a byte array containing the four octets of the IPv4 address followed by + * two bytes for the port number (little-endian encoding). + * Format: [octet0, octet1, octet2, octet3, port_low_byte, port_high_byte] + */ +public final class IPv4AddressArgument extends ByteEncodedArgument { + + private final int port; + + public IPv4AddressArgument(int startPosition, int length, int[] octets, int firstOctetIndex, int port) { + super(startPosition, length, 6); + + for (int i = firstOctetIndex; i < 4; i++) { + encodedBytes[i] = (byte) octets[i]; + } + this.port = port; + + // Encode port in little-endian format (2 bytes) + encodedBytes[4] = (byte) (port & 0xFF); // low byte + encodedBytes[5] = (byte) ((port >> 8) & 0xFF); // high byte + } + + @Override + public EncodingType type() { + return EncodingType.IPV4_ADDRESS; + } + + public int getPort() { + return port; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/IPv4Argument.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/IPv4Argument.java new file mode 100644 index 0000000000000..b34e2915669e5 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/IPv4Argument.java @@ -0,0 +1,33 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +/** + * Represents an IPv4 address argument extracted from a text message. + *
+ * <p>
+ * The value is a byte array of the four octets of the IPv4 address. + */ +public final class IPv4Argument extends ByteEncodedArgument { + + public IPv4Argument(int startPosition, int length, int[] octets, int firstOctetIndex) { + super(startPosition, length, 4); + for (int i = firstOctetIndex; i < 4; i++) { + if (octets[i] < 0 || octets[i] > 255) { + throw new IllegalArgumentException("Each octet of an IPv4 address must be between 0 and 255."); + } + encodedBytes[i] = (byte) octets[i]; + } + } + + @Override + public EncodingType type() { + return EncodingType.IPV4; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/IntegerArgument.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/IntegerArgument.java new file mode 100644 index 0000000000000..7f3bd612b9660 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/IntegerArgument.java @@ -0,0 +1,72 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import org.elasticsearch.common.util.ByteUtils; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +import java.util.Base64; + +/** + * Represents an integer argument extracted from a text message. + */ +public final class IntegerArgument implements Argument { + private final int startPosition; + private final int length; + private final int value; + private final Sign sign; + + // for encoding + private final byte[] integerBytes = new byte[4]; + private final Base64.Encoder encoder = Base64.getEncoder().withoutPadding(); + + public IntegerArgument(int startPosition, int length, int value) { + this(startPosition, length, value, null); + } + + public IntegerArgument(int startPosition, int length, int value, Sign sign) { + this.startPosition = startPosition; + this.length = length; + this.value = value; + this.sign = sign; + } + + /** + * NOTE: this method is boxing the int value into a Integer object. + * @return the value as an Integer object + */ + @Override + public Integer value() { + return value; + } + + @Override + public EncodingType type() { + return EncodingType.INTEGER; + } + + @Override + public int startPosition() { + return startPosition; + } + + @Override + public int length() { + return length; + } + + @Override + public String encode() { + ByteUtils.writeIntLE(value, integerBytes, 0); + return encoder.encodeToString(integerBytes); + } + + public Sign sign() { + return sign; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/KeywordArgument.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/KeywordArgument.java new file mode 100644 index 0000000000000..d1838585e26b0 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/KeywordArgument.java @@ -0,0 +1,58 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +/** + * Represents a keyword argument extracted from a text message. + *
+ * <p>
+ * A keyword is different from a simple text token in that it describes a token that is encoded as a string, + * but it represents a message argument and not a static token. + * Ideally, only arguments with low cardinality should be represented by a Keyword. + * High cardinality ones (like UUIDs for example) should be represented by a different type, as much as possible. + * Since we rely on a generic schema for the identification of arguments, we take into account that it would be used + * for high cardinality arguments as well. + */ +public final class KeywordArgument implements Argument { + private final int startPosition; + private final int length; + private final StringBuilder value; + + public KeywordArgument(String s, int start, int length) { + this.startPosition = start; + this.length = length; + this.value = new StringBuilder(length); + this.value.append(s, start, start + length); + } + + @Override + public String value() { + return value.toString(); + } + + @Override + public EncodingType type() { + return EncodingType.TEXT; + } + + @Override + public int startPosition() { + return startPosition; + } + + @Override + public int length() { + return length; + } + + @Override + public String encode() { + return value.toString(); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/ParseException.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/ParseException.java new file mode 100644 index 0000000000000..9c6b20ed679a1 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/ParseException.java @@ -0,0 +1,22 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +/** + * Exception thrown when a parsing operation fails. + */ +public class ParseException extends Exception { + + public ParseException(String message) { + super(message); + } + + public ParseException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/Parser.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/Parser.java new file mode 100644 index 0000000000000..7fb56c9b4044c --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/Parser.java @@ -0,0 +1,69 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import java.util.List; + +/** + * Interface for parsing raw text messages into structured patterns with typed arguments. + * + *
+ * <p>
Implementations of this interface are responsible for analyzing input text, identifying static parts and extracting + * dynamic parts into arguments (like timestamps, numbers, hexadecimal etc.). + * + *
+ * <p>
The parser operates by recognizing tokens and sub-tokens within the input text, matching them against configured patterns, and + * producing an ordered list of typed arguments, where the details about each argument include: + *
+ * <ul>
+ *     <li> the type of the argument </li>
+ *     <li> the extracted value (e.g. number for numeric arguments, millis since epoch for timestamps etc.) </li>
+ *     <li> the start and end position of the text that was used for argument extraction within the input message </li>
+ * </ul>
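+ * <p>
+ * A minimal usage sketch (illustrative only; which arguments are extracted and which placeholder symbols appear depend on the
+ * compiled schema, so the pattern shown in the comment below is an assumption):
+ * <pre>{@code
+ * String raw = "Response from 127.0.0.1 took 2000 ms";
+ * Parser parser = ParserFactory.createParser();
+ * List<Argument<?>> arguments = parser.parse(raw);
+ * StringBuilder pattern = new StringBuilder();
+ * Parser.constructPattern(raw, arguments, pattern, true);
+ * // pattern may now read "Response from %4 took %I ms" ('%' followed by each argument's EncodingType symbol)
+ * }</pre>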
+ */ +public interface Parser { + + char PLACEHOLDER_PREFIX = '%'; + + /** + * Parses a raw text message and extracts an ordered list of typed arguments. The first argument of type {@link Timestamp} is THE + * timestamp of the message. + * + * @param rawMessage the raw text message to parse + * @return an ordered list of typed arguments extracted from the message, including start and end positions within the original text + * @throws ParseException if the message cannot be parsed + */ + List> parse(String rawMessage) throws ParseException; + + /** + * Constructs a pattern string from the raw message and the list of extracted arguments. + * @param rawMessage the original raw message + * @param arguments the list of extracted arguments + * @param patternedMessage a StringBuilder to append the constructed pattern to + * @param putPlaceholders if true, placeholders will be used for arguments; if false, the argument parts will be omitted from the + * pattern + */ + static void constructPattern(String rawMessage, List> arguments, StringBuilder patternedMessage, boolean putPlaceholders) { + patternedMessage.setLength(0); + int currentIndex = 0; + for (Argument argument : arguments) { + int argStart = argument.startPosition(); + int argEnd = argStart + argument.length(); + // Append the static part before the argument + if (currentIndex < argStart) { + patternedMessage.append(rawMessage, currentIndex, argStart); + } + // Append the argument placeholder or skip it + if (putPlaceholders) { + patternedMessage.append(PLACEHOLDER_PREFIX).append(argument.type().getSymbol()); + } + currentIndex = argEnd; + } + // Append any remaining static part after the last argument + if (currentIndex < rawMessage.length()) { + patternedMessage.append(rawMessage, currentIndex, rawMessage.length()); + } + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/ParserFactory.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/ParserFactory.java new file mode 100644 index 0000000000000..594e24550a137 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/ParserFactory.java @@ -0,0 +1,37 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.compiler.CompiledSchema; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.compiler.SchemaCompiler; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.CharParser; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.Schema; + +/** + * Factory for creating {@link Parser} instances with a pre-compiled schema. + * + *
+ * <p>
All parser instances share the same compiled schema for efficiency. + * This factory is thread-safe. + */ +public class ParserFactory { + + private static final CompiledSchema compiledSchema = SchemaCompiler.compile(Schema.getInstance()); + + private ParserFactory() { + // Prevent instantiation + } + + /** + * Creates a new parser instance. + * + * @return a new {@link Parser} instance + */ + public static Parser createParser() { + return new CharParser(compiledSchema); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/Sign.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/Sign.java new file mode 100644 index 0000000000000..1a4e39b0651cf --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/Sign.java @@ -0,0 +1,13 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +public enum Sign { + PLUS, + MINUS +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/Timestamp.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/Timestamp.java new file mode 100644 index 0000000000000..94c0c831a8893 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/Timestamp.java @@ -0,0 +1,80 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import org.elasticsearch.common.util.ByteUtils; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +import java.util.Base64; + +/** + * Represents a timestamp extracted from a text message. + *
+ * <p>
+ * The value is a long representing the number of milliseconds since the epoch. + * It also holds the format of the timestamp as a string. + */ +public final class Timestamp implements Argument { + private final int startPosition; + private final int length; + private final long timestampMillis; + private final String format; + + // for encoding + private final byte[] millisBytes = new byte[8]; + private final Base64.Encoder encoder = Base64.getEncoder().withoutPadding(); + + public Timestamp(int startPosition, int length, long timestampMillis, String format) { + this.startPosition = startPosition; + this.length = length; + this.timestampMillis = timestampMillis; + this.format = format; + } + + public long getTimestampMillis() { + return timestampMillis; + } + + public String getFormat() { + return format; + } + + /** + * NOTE: this method is boxing the long value into a Long object. + * @return the timestamp as a Long object + */ + @Override + public Long value() { + return timestampMillis; + } + + @Override + public EncodingType type() { + return EncodingType.TIMESTAMP; + } + + @Override + public int startPosition() { + return startPosition; + } + + @Override + public int length() { + return length; + } + + @Override + public String encode() { + ByteUtils.writeLongLE(timestampMillis, millisBytes, 0); + return encoder.encodeToString(millisBytes); + } + + @Override + public String toString() { + return "Timestamp{" + "timestamp=" + timestampMillis + ", format='" + format + '\'' + '}'; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/UUIDArgument.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/UUIDArgument.java new file mode 100644 index 0000000000000..63c1100893a9e --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/UUIDArgument.java @@ -0,0 +1,39 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import org.elasticsearch.common.util.ByteUtils; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +import java.util.UUID; + +/** + * Represents a UUID argument extracted from a text message. + *
+ * <p>
+ * The value is a byte array of the 16 bytes of the UUID. + */ +public final class UUIDArgument extends ByteEncodedArgument { + + public UUIDArgument(String s, int startPosition, int length) { + super(startPosition, length, 16); + if (length == 36) { + // UUID in standard format (e.g., "123e4567-e89b-12d3-a456-426614174000") + UUID uuid = UUID.fromString(s.substring(startPosition, startPosition + length)); + ByteUtils.writeLongLE(uuid.getMostSignificantBits(), encodedBytes, 0); + ByteUtils.writeLongLE(uuid.getLeastSignificantBits(), encodedBytes, 8); + } else if (length == 32) { + // UUID in compact format (e.g., "123e4567e89b12d3a456426614174000") + // todo - handle this case + } + } + + @Override + public EncodingType type() { + return EncodingType.UUID; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/CharCodes.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/CharCodes.java new file mode 100644 index 0000000000000..2b566b4ad2617 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/CharCodes.java @@ -0,0 +1,19 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.common; + +public class CharCodes { + // an optimization to store byte codes for different character types instead of enum references, to make it more cache-friendly + public static final byte DIGIT_CHAR_CODE = 0; + public static final byte ALPHABETIC_CHAR_CODE = 1; + public static final byte SUBTOKEN_DELIMITER_CHAR_CODE = 2; + public static final byte TOKEN_DELIMITER_CHAR_CODE = 3; + public static final byte TOKEN_BOUNDARY_CHAR_CODE = 4; + public static final byte LINE_END_CODE = 5; + public static final byte OTHER_CHAR_CODE = 6; +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/EncodingType.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/EncodingType.java new file mode 100644 index 0000000000000..368f6843a5abc --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/EncodingType.java @@ -0,0 +1,44 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.common; + +public enum EncodingType { + TEXT('A', "text"), + INTEGER('I', "integer"), + DOUBLE('F', "double"), + HEX('H', "hexadecimal"), + IPV4('4', "IPv4"), + IPV4_ADDRESS('V', "IPv4 and port"), + UUID('U', "UUID"), + TIMESTAMP('T', "timestamp"); + + private final char symbol; + private final String description; + + EncodingType(char symbol, String description) { + this.symbol = symbol; + this.description = description; + } + + public char getSymbol() { + return symbol; + } + + public String getDescription() { + return description; + } + + public static EncodingType fromSymbol(char symbol) { + for (EncodingType type : values()) { + if (type.symbol == symbol) { + return type; + } + } + throw new IllegalArgumentException("Unknown token encoding type symbol: " + symbol); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/OperatorType.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/OperatorType.java new file mode 100644 index 0000000000000..cb4f40ac4f994 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/OperatorType.java @@ -0,0 +1,43 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.common; + +public enum OperatorType { + EQUALITY("=="), + LESS_THAN("<"), + GREATER_THAN(">"), + LESS_THAN_OR_EQUAL("<="), + GREATER_THAN_OR_EQUAL(">="), + NOT_EQUAL("!="), + RANGE("-"), + SET("|"), + MAP("=|"), + LENGTH("{}"), + AND("&&"), + OR("||"), + ANY("*"); + + private final String symbol; + + OperatorType(String symbol) { + this.symbol = symbol; + } + + public String getSymbol() { + return symbol; + } + + public static OperatorType fromSymbol(String symbol) { + for (OperatorType op : values()) { + if (op.symbol.equals(symbol)) { + return op; + } + } + throw new IllegalArgumentException("Unknown operator symbol: " + symbol); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/TimestampComponentType.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/TimestampComponentType.java new file mode 100644 index 0000000000000..311257d43aeeb --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/TimestampComponentType.java @@ -0,0 +1,88 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.common; + +public enum TimestampComponentType { + YEAR("Y"), + MONTH("M"), + DAY("D"), + HOUR("h"), + AM_PM("AP"), + MINUTE("m"), + SECOND("s"), + MILLISECOND("ms"), + MICROSECOND("us"), + NANOSECOND("ns"), + TIMEZONE_OFFSET_HOURS("TZh"), + TIMEZONE_OFFSET_MINUTES("TZm"), + TIMEZONE_OFFSET_HOURS_AND_MINUTES("TZhm"), + NA("NA"); + + // AM/PM indicator codes + public static final int NO_AM_PM_CODE = 0; + public static final int AM_CODE = 1; + public static final int PM_CODE = 2; + + // enum instance codes (derived from ordinal) + public static final int YEAR_CODE; + public static final int MONTH_CODE; + public static final int DAY_CODE; + public static final int HOUR_CODE; + public static final int AM_PM_CODE; + public static final int MINUTE_CODE; + public static final int SECOND_CODE; + public static final int MILLISECOND_CODE; + public static final int MICROSECOND_CODE; + public static final int NANOSECOND_CODE; + public static final int TIMEZONE_OFFSET_HOURS_CODE; + public static final int TIMEZONE_OFFSET_MINUTES_CODE; + public static final int TIMEZONE_OFFSET_HOURS_AND_MINUTES_CODE; + public static final int NA_CODE; + + static { + YEAR_CODE = YEAR.ordinal(); + MONTH_CODE = MONTH.ordinal(); + DAY_CODE = DAY.ordinal(); + HOUR_CODE = HOUR.ordinal(); + AM_PM_CODE = AM_PM.ordinal(); + MINUTE_CODE = MINUTE.ordinal(); + SECOND_CODE = SECOND.ordinal(); + MILLISECOND_CODE = MILLISECOND.ordinal(); + MICROSECOND_CODE = MICROSECOND.ordinal(); + NANOSECOND_CODE = NANOSECOND.ordinal(); + TIMEZONE_OFFSET_HOURS_CODE = TIMEZONE_OFFSET_HOURS.ordinal(); + TIMEZONE_OFFSET_MINUTES_CODE = TIMEZONE_OFFSET_MINUTES.ordinal(); + TIMEZONE_OFFSET_HOURS_AND_MINUTES_CODE = TIMEZONE_OFFSET_HOURS_AND_MINUTES.ordinal(); + NA_CODE = NA.ordinal(); + } + + private final int code; + private final String symbol; + + TimestampComponentType(String symbol) { + this.code = this.ordinal(); + this.symbol = symbol; + } + + public int getCode() { + return code; + } + + public String getSymbol() { + return symbol; + } + + public static TimestampComponentType fromSymbol(String symbol) { + for (TimestampComponentType tc : values()) { + if (tc.symbol.equals(symbol)) { + return tc; + } + } + throw new IllegalArgumentException("Unknown timestamp component symbol: " + symbol); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/Type.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/Type.java new file mode 100644 index 0000000000000..5bc892f7155d4 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/Type.java @@ -0,0 +1,14 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.common; + +public interface Type { + String name(); + + EncodingType encodingType(); +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/package-info.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/package-info.java new file mode 100644 index 0000000000000..21e6350eb188a --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/common/package-info.java @@ -0,0 +1,11 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/** + * A package containing utilities and classes that are common to schema representation, compilation, and parsing. + */ +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.common; diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/CompiledSchema.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/CompiledSchema.java new file mode 100644 index 0000000000000..9a9fff0f4e43c --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/CompiledSchema.java @@ -0,0 +1,206 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.compiler; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.BitmaskRegistry; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.CharSpecificParsingInfo; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.MultiTokenType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubTokenType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubstringToIntegerMap; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.TokenType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.Schema; + +/** + * Holds the compiled form of the schema.yaml file contents for the parser. See {@link Schema} for more details. + * The compiled form is essentially a set of fast-access lookup tables that allow the parser to quickly update the parsing state of the + * currently parsed subToken, token, and multi-token. States are represented through bitmasks. During schema compilation, each subToken, + * token, and multi-token is assigned a unique bit in a bitmask. During parsing, the subToken bitmask represents a superset of all + * subToken bits that correspond subToken-types that are still valid for the current subToken being parsed (meaning - all such that + * have not been eliminated yet). Similarly, the current token bitmask represents all valid tokens for the current token being parsed, + * and the current multi-token bitmask represents all valid multi-tokens for the recent tokens. 
+ */ +@SuppressWarnings("ClassCanBeRecord") +public final class CompiledSchema { + /** + * A fast-access lookup table for finding the subToken bitmask based on the character. + * This array contains bitmask mappings for only and all ASCII characters. + */ + public final int[] charToSubTokenBitmask; + + /** + * A fast-access lookup table for finding the character type based on the character. + * This array contains type mappings for only and all ASCII characters. + */ + public final byte[] charToCharType; + + /** + * A fast-access lookup table for finding parsing information for delimiter characters. + * This array contains instances of {@link CharSpecificParsingInfo} for delimiter characters, where the index in the array + * corresponds to the ASCII code of the character. + * This means that the array is sparse and only contains information for characters that are defined as delimiters in the schema. + */ + public final CharSpecificParsingInfo[] charSpecificParsingInfos; + + /** + * A fast-access map for retrieving the numeric value representation for String subTokens. + */ + public final SubstringToIntegerMap subTokenNumericValueRepresentation; + + /** + * The maximum number of subTokens that can be parsed from a single token. + */ + public final int maxSubTokensPerToken; + + /** + * The maximum number of tokens that can be parsed from a single multi-token. + */ + public final int maxTokensPerMultiToken; + + /** + * A bitmask with only the generic integer subToken bit set. + */ + public final int intSubTokenBitmask; + + /** + * A bitmask that represents all subTokens types that are of an integer type. + */ + public final int allIntegerSubTokenBitmask; + + /** + * A bitmask that represents all generic subToken types, such as integer, hexadecimal, keyword, etc. + */ + public final int genericSubTokenTypesBitmask; + + /** + * A fast-access bitmask lookup table for integer subTokens. This requires a two-step lookup. + * See {@link #integerSubTokenBitmaskArrayRanges} for details. + */ + public final int[] integerSubTokenBitmasks; + + /** + * An auxiliary array that is used to determine the proper bitmask index within the {@link #integerSubTokenBitmasks} array for each + * integer value. + * Entries in this array contain the upper bounds (inclusive) of ranges of integers that share the same bitmask. The lookup in this + * array is done using binary search. Once the upper bound is found, its index within this array indicates the proper bitmask index + * within the {@link #integerSubTokenBitmasks} array. The last entry in this array is always {@link Integer#MAX_VALUE}. + *
+ * <p>
+ * For example, consider this array: [0, 10, {@link Integer#MAX_VALUE}]. The first entry (0) indicates that the first bitmask in + * {@link #integerSubTokenBitmasks} is valid for all integers from {@link Integer#MIN_VALUE} to 0 (inclusive). + * The second entry (10) indicates that the second bitmask in {@link #integerSubTokenBitmasks} is valid for all integers from 1 to 10 + * (inclusive). The last entry (Integer.MAX_VALUE) indicates that the third bitmask in {@link #integerSubTokenBitmasks} is valid for all + * integers from 11 to {@link Integer#MAX_VALUE} (inclusive). + * So, when looking for the matching bitmask for the integer value of 5, we would determine that we need to use the second bitmask from + * the {@link #integerSubTokenBitmasks} array. + *
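+ * <p>
+ * A minimal sketch of this two-step lookup (a hypothetical helper, not part of this class), assuming {@code ranges} is this array
+ * and {@code bitmasks} is {@link #integerSubTokenBitmasks}:
+ * <pre>{@code
+ * static int bitmaskFor(int value, int[] ranges, int[] bitmasks) {
+ *     int idx = java.util.Arrays.binarySearch(ranges, value);
+ *     if (idx < 0) {
+ *         idx = -idx - 1; // insertion point = first upper bound that is >= value
+ *     }
+ *     return bitmasks[idx]; // safe: the last range entry is Integer.MAX_VALUE
+ * }
+ * }</pre>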

+ */ + public final int[] integerSubTokenBitmaskArrayRanges; + + /** + * A fast-access bitmask lookup table for small integers. This provides an optimization for small integers, that are more common, + * over the two-step lookup (and a binary search) that is required for using {@link #integerSubTokenBitmasks}. + */ + public final int[] smallIntegerSubTokenBitmasks; + + /** + * A fast-access token bitmask lookup table for each number of subTokens. For example, the bitmask at index 3 indicates the token + * types that have 4 subTokens. + */ + public final int[] subTokenCountToTokenBitmask; + + /** + * A fast-access multi-token bitmask lookup table for each number of tokens. For example, the bitmask at index 2 indicates the + * multi-token types that have 3 tokens. + */ + public final int[] tokenCountToMultiTokenBitmask; + + /** + * A fast-access multi-token bitmask lookup table for each number of subTokens. For example, the bitmask at index 5 indicates the + * multi-token types that have 6 subTokens (regardless of the number of tokens). + */ + public final int[] subTokenCountToMultiTokenBitmask; + + /** + * A fast-access multi-token bitmask lookup table for each total length of delimiter parts. For example, the bitmask at index 10 + * indicates the multi-token types that have delimiter parts with a total length of 10 characters. Delimiter parts are the literal + * strings between tokens in a multi-token format. + */ + public final int[] delimiterPartsTotalLengthToMultiTokenBitmask; + + /** + * A subToken bitmask registry that allows for fast access to subToken types by their bit index or bitmask. + */ + public final BitmaskRegistry subTokenBitmaskRegistry; + + /** + * A token bitmask registry that allows for fast access to token types by their bit index or bitmask. + */ + public final BitmaskRegistry tokenBitmaskRegistry; + + /** + * The maximum number of subTokens that can be parsed from a single multi-token. + */ + public final int maxSubTokensPerMultiToken; + + /** + * A multi-token bitmask registry that allows for fast access to multi-token types by their bit index or bitmask. 
+ */ + public final BitmaskRegistry multiTokenBitmaskRegistry; + + public CompiledSchema( + int[] charToSubTokenBitmask, + byte[] charToCharType, + CharSpecificParsingInfo[] charSpecificParsingInfos, + SubstringToIntegerMap subTokenNumericValueRepresentation, + int maxSubTokensPerToken, + int maxTokensPerMultiToken, + int maxSubTokensPerMultiToken, + int intSubTokenBitmask, + int allIntegerSubTokenBitmask, + int genericSubTokenTypesBitmask, + int[] integerSubTokenBitmasks, + int[] integerSubTokenBitmaskArrayRanges, + int[] smallIntegerSubTokenBitmasks, + int[] subTokenCountToTokenBitmask, + int[] tokenCountToMultiTokenBitmask, + int[] subTokenCountToMultiTokenBitmask, + int[] delimiterPartsTotalLengthToMultiTokenBitmask, + BitmaskRegistry subTokenBitmaskRegistry, + BitmaskRegistry tokenBitmaskRegistry, + BitmaskRegistry multiTokenBitmaskRegistry + ) { + this.charToSubTokenBitmask = charToSubTokenBitmask; + this.charToCharType = charToCharType; + this.charSpecificParsingInfos = charSpecificParsingInfos; + this.subTokenNumericValueRepresentation = subTokenNumericValueRepresentation; + this.maxSubTokensPerToken = maxSubTokensPerToken; + this.maxTokensPerMultiToken = maxTokensPerMultiToken; + this.maxSubTokensPerMultiToken = maxSubTokensPerMultiToken; + this.intSubTokenBitmask = intSubTokenBitmask; + this.allIntegerSubTokenBitmask = allIntegerSubTokenBitmask; + this.genericSubTokenTypesBitmask = genericSubTokenTypesBitmask; + this.integerSubTokenBitmasks = integerSubTokenBitmasks; + this.integerSubTokenBitmaskArrayRanges = integerSubTokenBitmaskArrayRanges; + this.smallIntegerSubTokenBitmasks = smallIntegerSubTokenBitmasks; + this.subTokenCountToTokenBitmask = subTokenCountToTokenBitmask; + this.tokenCountToMultiTokenBitmask = tokenCountToMultiTokenBitmask; + this.subTokenCountToMultiTokenBitmask = subTokenCountToMultiTokenBitmask; + this.delimiterPartsTotalLengthToMultiTokenBitmask = delimiterPartsTotalLengthToMultiTokenBitmask; + if (subTokenBitmaskRegistry.isSealed() == false) { + throw new IllegalArgumentException("SubToken bitmask registry must be sealed before passing to the compiled schema"); + } + this.subTokenBitmaskRegistry = subTokenBitmaskRegistry; + if (tokenBitmaskRegistry.isSealed() == false) { + throw new IllegalArgumentException("Token bitmask registry must be sealed before passing to the compiled schema"); + } + this.tokenBitmaskRegistry = tokenBitmaskRegistry; + if (multiTokenBitmaskRegistry.isSealed() == false) { + throw new IllegalArgumentException("Multi-token bitmask registry must be sealed before passing to the compiled schema"); + } + this.multiTokenBitmaskRegistry = multiTokenBitmaskRegistry; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SchemaCompiler.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SchemaCompiler.java new file mode 100644 index 0000000000000..d9881182ed340 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SchemaCompiler.java @@ -0,0 +1,955 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.compiler; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.TimestampComponentType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.BitmaskRegistry; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.CharSpecificParsingInfo; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.MultiTokenType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubTokenType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubstringToIntegerMap; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubstringView; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.TimestampFormat; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.TokenType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.MultiTokenFormat; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.Schema; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.SubTokenBaseType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenFormat; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.IntConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.IntConstraints; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringToIntMapConstraint; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Supplier; +import java.util.function.ToIntFunction; + +import static org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes.ALPHABETIC_CHAR_CODE; +import static org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes.DIGIT_CHAR_CODE; +import static org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes.OTHER_CHAR_CODE; +import static org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes.SUBTOKEN_DELIMITER_CHAR_CODE; +import static org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes.TOKEN_BOUNDARY_CHAR_CODE; +import static org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes.TOKEN_DELIMITER_CHAR_CODE; + +public class SchemaCompiler { + + public static final int ASCII_RANGE = 128; + public static final int SMALL_INTEGERS_MAX_VALUE = 100; + public static final String INTEGER_SUBTOKEN_NAME = "integer"; + public static final String DOUBLE_SUBTOKEN_NAME = "double"; + public static final String HEX_SUBTOKEN_NAME = "hex"; + + // todo - try to break this method into smaller methods + public static CompiledSchema compile(Schema schema) { + + byte[] charToCharType = new byte[ASCII_RANGE]; + for (int i = 0; i < ASCII_RANGE; i++) { + charToCharType[i] = getCharCode((char) i, schema); + } + + int[] charToSubTokenBitmask = new int[ASCII_RANGE]; + + // for each delimiter char, we store the superset of token types that can be valid for this delimiter at each subToken index + Map> delimiterCharToTokenBitmaskPerSubTokenIndex = new HashMap<>(); + // 
for each token format, the last subToken is not identified by a subToken delimiter, but rather by a token delimiter + ArrayList tokenBitmaskForLastSubToken = new ArrayList<>(); + // for each delimiter char, we store a list of functions that can generate a token bitmask based on the string value of the + // subToken, the subToken index, and the delimiter char + Map> delimiterCharToBitmaskGeneratorPerSubTokenIndex = new HashMap<>(); + // for each token format, the last subToken is not identified by a subToken delimiter, but rather by a token delimiter + ArrayList bitmaskGeneratorForLastSubToken = new ArrayList<>(); + // a global map for all string subToken types, that maps a string value to the corresponding subToken bitmask + SubstringToIntegerMap.Builder subTokenValueToBitmaskMapBuilder = SubstringToIntegerMap.builder(); + // for each token boundary character (either token delimiters or boundary characters), we store the superset of multi-token types + // that are valid for this character at each index within the total concatenated delimiter parts of all multi-token formats. + // For example, given the multi-token format "$Mon, $DD $YYYY $timeS $AP", the full concatenated string made up of the delimiter + // parts is ", ". Therefore, the bit of this multi-token will be set at index 0 for the character ',' and at indices + // 1, 2, 3, and 4 for the space character. + Map> tokenBoundaryCharToMultiTokenBitmaskPerIndex = new HashMap<>(); + + int allSubTokenBitmask = 0; + int intSubTokenBitmask; + int allIntegerSubTokenBitmask = 0; + int genericSubTokenTypesBitmask = 0; + + Map subTokenCountToTokenBitmaskMap = new HashMap<>(); + Map tokenCountToMultiTokenBitmaskMap = new HashMap<>(); + Map subTokenCountToMultiTokenBitmaskMap = new HashMap<>(); + + int maxTokensPerMultiToken = 0; + int maxSubTokensPerMultiToken = 0; + int maxDelimiterPartsLength = 0; + + Map> tokenTypeToMultiTokenBitmaskByPosition = new HashMap<>(); + BitmaskRegistry multiTokenBitmaskRegistry = new BitmaskRegistry<>(); + ArrayList multiTokenBitmaskPerDelimiterPartsLengths = new ArrayList<>(); + for (org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.MultiTokenType multiTokenType : schema.getMultiTokenTypes()) { + MultiTokenFormat format = multiTokenType.getFormat(); + List tokens = format.getTokens(); + + TimestampFormat timestampFormat = null; + if (multiTokenType.encodingType() == EncodingType.TIMESTAMP) { + timestampFormat = createTimestampFormat(format); + } + + int subTokenCount = multiTokenType.getNumberOfSubTokens(); + int multiTokenBitmask = multiTokenBitmaskRegistry.register( + new MultiTokenType(multiTokenType.name(), multiTokenType.encodingType(), subTokenCount, timestampFormat) + ); + + maxTokensPerMultiToken = Math.max(maxTokensPerMultiToken, tokens.size()); + maxSubTokensPerMultiToken = Math.max(maxSubTokensPerMultiToken, subTokenCount); + + updateBitmaskToCount(tokenCountToMultiTokenBitmaskMap, tokens.size(), multiTokenBitmask); + updateBitmaskToCount(subTokenCountToMultiTokenBitmaskMap, subTokenCount, multiTokenBitmask); + + for (int i = 0; i < tokens.size(); i++) { + String tokenName = tokens.get(i).name(); + updateBitmaskByPosition(tokenTypeToMultiTokenBitmaskByPosition, tokenName, i, multiTokenBitmask); + } + + int delimiterCharPosition = -1; + for (String delimiterPart : format.getDelimiterParts()) { + for (char tokenBoundaryCharacter : delimiterPart.toCharArray()) { + delimiterCharPosition++; + ArrayList multiTokenBitmaskPerDelimiterIndex = 
tokenBoundaryCharToMultiTokenBitmaskPerIndex.computeIfAbsent( + tokenBoundaryCharacter, + input -> new ArrayList<>() + ); + fillListUpToIndex(multiTokenBitmaskPerDelimiterIndex, delimiterCharPosition, () -> 0); + multiTokenBitmaskPerDelimiterIndex.set( + delimiterCharPosition, + multiTokenBitmaskPerDelimiterIndex.get(delimiterCharPosition) | multiTokenBitmask + ); + } + } + maxDelimiterPartsLength = Math.max(maxDelimiterPartsLength, delimiterCharPosition + 1); + fillListUpToIndex(multiTokenBitmaskPerDelimiterPartsLengths, delimiterCharPosition, () -> 0); + multiTokenBitmaskPerDelimiterPartsLengths.set( + delimiterCharPosition, + multiTokenBitmaskPerDelimiterPartsLengths.get(delimiterCharPosition) | multiTokenBitmask + ); + } + multiTokenBitmaskRegistry.seal(); + + int maxSubTokensPerToken = 0; + Map> subTokenTypeToTokenBitmaskByPosition = new HashMap<>(); + // a way to get the positions of subTokens and the corresponding delimiter characters within all token formats + Map< + org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.SubTokenType, + Map>> subTokenTypeToDelimiterCharToPositions = new HashMap<>(); + // a way to get the indices of the last subTokens within all token formats + Map> subTokenTypeToLastSubTokenIndex = + new HashMap<>(); + BitmaskRegistry tokenBitmaskRegistry = new BitmaskRegistry<>(); + + // register generic token types first, as they would have the lowest priority + // generic token types are not part of multi-token types, so they are registered with an empty multi-token bitmask + + // the double generic token type can span unknown number of tokens (e.g., "3.14" or "-1.06-e10") + int doubleTokenBitmask = tokenBitmaskRegistry.register( + new TokenType(DOUBLE_SUBTOKEN_NAME, EncodingType.DOUBLE, -1, null, new int[maxTokensPerMultiToken]) + ); + // floating point numbers can have 2 or 3 sub-tokens (e.g. 
"3.14" has 2 sub-tokens, "-1.06-e10" has 3 sub-tokens) + updateBitmaskToCount(subTokenCountToTokenBitmaskMap, 2, doubleTokenBitmask); + updateBitmaskToCount(subTokenCountToTokenBitmaskMap, 3, doubleTokenBitmask); + + tokenBitmaskRegistry.register(new TokenType(HEX_SUBTOKEN_NAME, EncodingType.HEX, 1, null, new int[maxTokensPerMultiToken])); + tokenBitmaskRegistry.register(new TokenType(INTEGER_SUBTOKEN_NAME, EncodingType.INTEGER, 1, null, new int[maxTokensPerMultiToken])); + + for (org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType tokenType : schema.getTokenTypes()) { + ArrayList multiTokenBitmaskByPositionList = tokenTypeToMultiTokenBitmaskByPosition.get(tokenType.name()); + int[] multiTokenBitmaskByPosition; + if (multiTokenBitmaskByPositionList == null) { + multiTokenBitmaskByPosition = new int[maxTokensPerMultiToken]; + } else { + fillListUpToIndex(multiTokenBitmaskByPositionList, maxTokensPerMultiToken - 1, () -> 0); + multiTokenBitmaskByPosition = new int[multiTokenBitmaskByPositionList.size()]; + for (int i = 0; i < multiTokenBitmaskByPositionList.size(); i++) { + multiTokenBitmaskByPosition[i] = multiTokenBitmaskByPositionList.get(i); + } + } + + TimestampFormat timestampFormat = null; + if (tokenType.encodingType() == EncodingType.TIMESTAMP) { + timestampFormat = createTimestampFormat(tokenType); + } + + int subTokenCount = tokenType.getNumberOfSubTokens(); + int tokenBitmask = tokenBitmaskRegistry.register( + new TokenType(tokenType.name(), tokenType.encodingType(), subTokenCount, timestampFormat, multiTokenBitmaskByPosition) + ); + + updateBitmaskToCount(subTokenCountToTokenBitmaskMap, subTokenCount, tokenBitmask); + + maxSubTokensPerToken = Math.max(maxSubTokensPerToken, subTokenCount); + TokenFormat format = tokenType.format(); + org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.SubTokenType[] subTokenTypes = format.getSubTokenTypes(); + for (int i = 0; i < subTokenTypes.length; i++) { + org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.SubTokenType subTokenType = subTokenTypes[i]; + String subTokenName = subTokenType.name(); + updateBitmaskByPosition(subTokenTypeToTokenBitmaskByPosition, subTokenName, i, tokenBitmask); + + char[] subTokenDelimiters = format.getSubTokenDelimiters(); + if (i < subTokenDelimiters.length) { + char subTokenDelimiter = subTokenDelimiters[i]; + ArrayList tokenBitmaskPerSubTokenIndex = delimiterCharToTokenBitmaskPerSubTokenIndex.computeIfAbsent( + subTokenDelimiter, + input -> new ArrayList<>() + ); + fillListUpToIndex(tokenBitmaskPerSubTokenIndex, i, () -> 0); + tokenBitmaskPerSubTokenIndex.set(i, tokenBitmaskPerSubTokenIndex.get(i) | tokenBitmask); + + Map> delimiterCharToPositions = subTokenTypeToDelimiterCharToPositions.computeIfAbsent( + subTokenType, + input -> new HashMap<>() + ); + Set positions = delimiterCharToPositions.computeIfAbsent(subTokenDelimiter, input -> new HashSet<>()); + positions.add(i); + } else { + fillListUpToIndex(tokenBitmaskForLastSubToken, i, () -> 0); + tokenBitmaskForLastSubToken.set(i, tokenBitmaskForLastSubToken.get(i) | tokenBitmask); + + Set lastSubTokenIndices = subTokenTypeToLastSubTokenIndex.computeIfAbsent( + subTokenType, + input -> new HashSet<>() + ); + lastSubTokenIndices.add(i); + } + } + } + tokenBitmaskRegistry.seal(); + + BitmaskRegistry subTokenBitmaskRegistry = new BitmaskRegistry<>(); + + ArrayList subTokenBaseTypes = schema.getSubTokenBaseTypes(); + + // Register generic subToken types first, as they would have the lowest priority + + int[] 
doubleTokenBitmaskByPosition = new int[maxSubTokensPerToken]; + doubleTokenBitmaskByPosition[0] = doubleTokenBitmask; + doubleTokenBitmaskByPosition[1] = doubleTokenBitmask; + doubleTokenBitmaskByPosition[2] = doubleTokenBitmask; + int doubleSubTokenBitmask = subTokenBitmaskRegistry.register( + new SubTokenType(DOUBLE_SUBTOKEN_NAME, EncodingType.DOUBLE, doubleTokenBitmaskByPosition, TimestampComponentType.NA) + ); + genericSubTokenTypesBitmask |= doubleSubTokenBitmask; + allSubTokenBitmask |= doubleSubTokenBitmask; + SubTokenBaseType doubleSubTokenBaseType = subTokenBaseTypes.stream() + .filter(baseType -> baseType.name().equals("double")) + .findFirst() + .orElseThrow(() -> new IllegalArgumentException("Double subToken base type not found in schema")); + updateCharToSubTokenBitmasks( + DOUBLE_SUBTOKEN_NAME, + charToSubTokenBitmask, + doubleSubTokenBaseType.allowedCharacters(), + doubleSubTokenBitmask + ); + + int hexSubTokenBitmask = subTokenBitmaskRegistry.register( + new SubTokenType(HEX_SUBTOKEN_NAME, EncodingType.HEX, new int[maxSubTokensPerToken], TimestampComponentType.NA) + ); + genericSubTokenTypesBitmask |= hexSubTokenBitmask; + allSubTokenBitmask |= hexSubTokenBitmask; + SubTokenBaseType hexSubTokenBaseType = subTokenBaseTypes.stream() + .filter(baseType -> baseType.name().equals("hexadecimal")) + .findFirst() + .orElseThrow(() -> new IllegalArgumentException("Hex subToken base type not found in schema")); + updateCharToSubTokenBitmasks(HEX_SUBTOKEN_NAME, charToSubTokenBitmask, hexSubTokenBaseType.allowedCharacters(), hexSubTokenBitmask); + + intSubTokenBitmask = subTokenBitmaskRegistry.register( + new SubTokenType(INTEGER_SUBTOKEN_NAME, EncodingType.INTEGER, new int[maxSubTokensPerToken], TimestampComponentType.NA) + ); + allIntegerSubTokenBitmask |= intSubTokenBitmask; + genericSubTokenTypesBitmask |= intSubTokenBitmask; + allSubTokenBitmask |= intSubTokenBitmask; + SubTokenBaseType integerSubTokenBaseType = subTokenBaseTypes.stream() + .filter(baseType -> baseType.name().equals("unsigned_integer")) + .findFirst() + .orElseThrow(() -> new IllegalArgumentException("Integer subToken base type not found in schema")); + updateCharToSubTokenBitmasks( + INTEGER_SUBTOKEN_NAME, + charToSubTokenBitmask, + integerSubTokenBaseType.allowedCharacters(), + intSubTokenBitmask + ); + + int[] smallIntegerSubTokenBitmasks = new int[SMALL_INTEGERS_MAX_VALUE + 1]; + for (int i = 0; i <= SMALL_INTEGERS_MAX_VALUE; i++) { + smallIntegerSubTokenBitmasks[i] = genericSubTokenTypesBitmask; + } + ArrayList intRangeBitmasks = new ArrayList<>(); + + for (org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.SubTokenType subTokenType : schema.getSubTokenTypes()) { + ArrayList tokenBitmaskByPositionList = subTokenTypeToTokenBitmaskByPosition.get(subTokenType.name()); + int[] tokenBitmaskByPosition; + if (tokenBitmaskByPositionList == null) { + tokenBitmaskByPosition = new int[maxSubTokensPerToken]; + } else { + fillListUpToIndex(tokenBitmaskByPositionList, maxSubTokensPerToken - 1, () -> 0); + tokenBitmaskByPosition = new int[tokenBitmaskByPositionList.size()]; + for (int i = 0; i < tokenBitmaskByPositionList.size(); i++) { + tokenBitmaskByPosition[i] = tokenBitmaskByPositionList.get(i); + } + } + EncodingType encodingType = subTokenType.encodingType(); + int subTokenBitmask = subTokenBitmaskRegistry.register( + new SubTokenType(subTokenType.name(), encodingType, tokenBitmaskByPosition, subTokenType.getTimestampComponentType()) + ); + if (encodingType == EncodingType.INTEGER) { + 
allIntegerSubTokenBitmask |= subTokenBitmask; + } + updateCharToSubTokenBitmasks(subTokenType.name(), charToSubTokenBitmask, subTokenType.getValidCharacters(), subTokenBitmask); + allSubTokenBitmask |= subTokenBitmask; + + IntConstraint intConstraint = subTokenType.getIntConstraint(); + if (intConstraint != null) { + for (int i = 0; i < smallIntegerSubTokenBitmasks.length; i++) { + if (intConstraint.isApplicable(i)) { + smallIntegerSubTokenBitmasks[i] |= subTokenBitmask; + } + } + for (IntConstraints.Range range : intConstraint.trueRanges()) { + intRangeBitmasks.add(new IntRangeBitmask(range, subTokenBitmask)); + } + } + + StringConstraint stringConstraint = subTokenType.getStringConstraint(); + if (stringConstraint != null) { + Map> delimiterCharToPositions = subTokenTypeToDelimiterCharToPositions.get(subTokenType); + if (delimiterCharToPositions != null) { + for (Map.Entry> entry : delimiterCharToPositions.entrySet()) { + char subTokenDelimiter = entry.getKey(); + ArrayList bitmaskGeneratorPerSubTokenIndex = + delimiterCharToBitmaskGeneratorPerSubTokenIndex.computeIfAbsent(subTokenDelimiter, input -> new ArrayList<>()); + Set positions = entry.getValue(); + fillListUpToIndex(bitmaskGeneratorPerSubTokenIndex, Collections.max(positions), () -> null); + addConstraintToChain(bitmaskGeneratorPerSubTokenIndex, subTokenBitmask, stringConstraint, positions); + } + } + // for the last subToken, we use the token delimiter to identify it, so we add it to the subTokenEvaluatorForLastSubToken + Set lastPositionIndices = subTokenTypeToLastSubTokenIndex.get(subTokenType); + if (lastPositionIndices != null) { + fillListUpToIndex(bitmaskGeneratorForLastSubToken, Collections.max(lastPositionIndices), () -> null); + addConstraintToChain(bitmaskGeneratorForLastSubToken, subTokenBitmask, stringConstraint, lastPositionIndices); + } + + // add mapping for string subTokens that map specific string values to specific numeric values + if (stringConstraint instanceof StringToIntMapConstraint(Map map)) { + map.forEach((key, value) -> { + Integer existing = subTokenValueToBitmaskMapBuilder.get(key); + if (existing != null) { + throw new IllegalArgumentException( + "SubToken value '" + + key + + "' is mapped to multiple numeric values: " + + existing + + " and " + + value + + ". Each subToken value can only be mapped to a single bitmask." + ); + } else { + subTokenValueToBitmaskMapBuilder.add(key, value); + } + }); + } + } + } + subTokenBitmaskRegistry.seal(); + + // taking care of delimiter characters related to floating point numbers like '.' 
and '-' + ArrayList dashBitmaskPerSubTokenCount = delimiterCharToTokenBitmaskPerSubTokenIndex.computeIfAbsent( + '-', + input -> new ArrayList<>() + ); + fillListUpToIndex(dashBitmaskPerSubTokenCount, 1, () -> 0); + dashBitmaskPerSubTokenCount.set(0, dashBitmaskPerSubTokenCount.get(0) | doubleSubTokenBitmask); + dashBitmaskPerSubTokenCount.set(1, dashBitmaskPerSubTokenCount.get(1) | doubleSubTokenBitmask); + ArrayList dotBitmaskPerSubTokenCount = delimiterCharToTokenBitmaskPerSubTokenIndex.computeIfAbsent( + '.', + input -> new ArrayList<>() + ); + fillListUpToIndex(dotBitmaskPerSubTokenCount, 0, () -> 0); + dotBitmaskPerSubTokenCount.set(0, dotBitmaskPerSubTokenCount.getFirst() | doubleSubTokenBitmask); + + // update floating point bitmask also for last sub-token for 1, 2, and 3 sub-tokens + tokenBitmaskForLastSubToken.set(0, tokenBitmaskForLastSubToken.get(0) | doubleSubTokenBitmask); + tokenBitmaskForLastSubToken.set(1, tokenBitmaskForLastSubToken.get(1) | doubleSubTokenBitmask); + tokenBitmaskForLastSubToken.set(2, tokenBitmaskForLastSubToken.get(2) | doubleSubTokenBitmask); + + CharSpecificParsingInfo[] charSpecificParsingInfos = new CharSpecificParsingInfo[ASCII_RANGE]; + + for (char delimiter : schema.getSubTokenDelimiters()) { + if (delimiter < ASCII_RANGE) { + // subToken delimiter characters should be mapped to the inclusive subToken bitmask, as they are valid to all subToken types + charToSubTokenBitmask[delimiter] = allSubTokenBitmask; + ArrayList tokenBitmaskPerSubTokenIndexArray = delimiterCharToTokenBitmaskPerSubTokenIndex.get(delimiter); + int[] tokenBitmaskPerSubTokenIndex = new int[maxSubTokensPerToken]; + if (tokenBitmaskPerSubTokenIndexArray != null) { + fillListUpToIndex(tokenBitmaskPerSubTokenIndexArray, maxSubTokensPerToken - 1, () -> 0); + for (int i = 0; i < tokenBitmaskPerSubTokenIndexArray.size(); i++) { + tokenBitmaskPerSubTokenIndex[i] = tokenBitmaskPerSubTokenIndexArray.get(i); + } + tokenBitmaskPerSubTokenIndex = tokenBitmaskPerSubTokenIndexArray.stream().mapToInt(Integer::intValue).toArray(); + } + ArrayList subTokenBitmaskGeneratorPerSubTokenIndexList = + delimiterCharToBitmaskGeneratorPerSubTokenIndex.get(delimiter); + ToIntFunction[] subTokenBitmaskGeneratorPerSubTokenIndices = turnChainBuilderListToFunctionArray( + subTokenBitmaskGeneratorPerSubTokenIndexList, + maxSubTokensPerToken + ); + charSpecificParsingInfos[delimiter] = new CharSpecificParsingInfo( + delimiter, + tokenBitmaskPerSubTokenIndex, + subTokenBitmaskGeneratorPerSubTokenIndices, + null + ); + } else { + throw new IllegalArgumentException( + "SubToken delimiter character '" + delimiter + "' is outside the ASCII range and will not be processed." + ); + } + } + + for (char tokenBoundaryCharacter : schema.getTokenBoundaryCharacters()) { + if (tokenBoundaryCharacter < ASCII_RANGE) { + int[] multiTokenBitmaskPerBoundaryCharIndex = getMultiTokenBitmaskPerBoundaryCharIndex( + tokenBoundaryCharacter, + tokenBoundaryCharToMultiTokenBitmaskPerIndex, + maxDelimiterPartsLength + ); + CharSpecificParsingInfo tokenBoundaryCharParsingInfo = new CharSpecificParsingInfo( + tokenBoundaryCharacter, + null, + null, + multiTokenBitmaskPerBoundaryCharIndex + ); + charSpecificParsingInfos[tokenBoundaryCharacter] = tokenBoundaryCharParsingInfo; + } else { + throw new IllegalArgumentException( + "Token boundary character '" + tokenBoundaryCharacter + "' is outside the ASCII range and will not be processed." 
+ ); + } + } + + int[] tokenBitmaskPerSubTokenIndex = tokenBitmaskForLastSubToken.stream().mapToInt(Integer::intValue).toArray(); + ToIntFunction[] subTokenBitmaskGeneratorForLastIndex = turnChainBuilderListToFunctionArray( + bitmaskGeneratorForLastSubToken, + maxSubTokensPerToken + ); + for (char delimiter : schema.getTokenDelimiters()) { + if (delimiter < ASCII_RANGE) { + // token delimiter characters are also subToken delimiters (that delimit between the last subToken to the next token) + // and therefore should be mapped to the inclusive subToken bitmask, as they are valid to all subToken types + charToSubTokenBitmask[delimiter] = allSubTokenBitmask; + + int[] multiTokenBitmaskPerBoundaryCharIndex = getMultiTokenBitmaskPerBoundaryCharIndex( + delimiter, + tokenBoundaryCharToMultiTokenBitmaskPerIndex, + maxDelimiterPartsLength + ); + CharSpecificParsingInfo tokenDelimiterCharParsingInfo = new CharSpecificParsingInfo( + delimiter, + tokenBitmaskPerSubTokenIndex, + subTokenBitmaskGeneratorForLastIndex, + multiTokenBitmaskPerBoundaryCharIndex + ); + charSpecificParsingInfos[delimiter] = tokenDelimiterCharParsingInfo; + } else { + throw new IllegalArgumentException( + "Token delimiter character '" + delimiter + "' is outside the ASCII range and will not be processed." + ); + } + } + + for (char tokenBoundaryChar : schema.getTokenBoundaryCharacters()) { + if (tokenBoundaryChar < ASCII_RANGE) { + // token boundary characters should not invalidate any sub-token, so we use the inclusive sub-token bitmask + charToSubTokenBitmask[tokenBoundaryChar] = allSubTokenBitmask; + } else { + throw new IllegalArgumentException( + "Token boundary character '" + tokenBoundaryChar + "' is outside the ASCII range and will not be processed." + ); + } + } + + int[] subTokenCountToTokenBitmask = new int[maxSubTokensPerToken]; + for (Map.Entry entry : subTokenCountToTokenBitmaskMap.entrySet()) { + int subTokenCount = entry.getKey(); + int bitmask = entry.getValue(); + if (subTokenCount <= subTokenCountToTokenBitmask.length) { + subTokenCountToTokenBitmask[subTokenCount - 1] = bitmask; + } else { + throw new IllegalStateException( + "Sub-token count " + + subTokenCount + + " exceeds the size of the subTokenCountToTokenBitmask array (" + + subTokenCountToTokenBitmask.length + + "). This may lead to unexpected behavior." + ); + } + } + + int[] tokenCountToMultiTokenBitmask = new int[maxTokensPerMultiToken]; + for (Map.Entry entry : tokenCountToMultiTokenBitmaskMap.entrySet()) { + int tokenCount = entry.getKey(); + int bitmask = entry.getValue(); + if (tokenCount <= tokenCountToMultiTokenBitmask.length) { + tokenCountToMultiTokenBitmask[tokenCount - 1] = bitmask; + } else { + throw new IllegalStateException( + "Token count " + + tokenCount + + " exceeds the size of the tokenCountToMultiTokenBitmaskArray (" + + tokenCountToMultiTokenBitmask.length + + "). This may lead to unexpected behavior." + ); + } + } + + int[] subTokenCountToMultiTokenBitmask = new int[maxSubTokensPerMultiToken]; + for (Map.Entry entry : subTokenCountToMultiTokenBitmaskMap.entrySet()) { + int subTokenCount = entry.getKey(); + int bitmask = entry.getValue(); + if (subTokenCount <= subTokenCountToMultiTokenBitmask.length) { + subTokenCountToMultiTokenBitmask[subTokenCount - 1] = bitmask; + } else { + throw new IllegalStateException( + "Sub-token count " + + subTokenCount + + " exceeds the size of the subTokenCountToMultiTokenBitmaskArray (" + + subTokenCountToMultiTokenBitmask.length + + "). This may lead to unexpected behavior." 
+ ); + } + } + + int[] delimiterLengthToMultiTokenBitmask = new int[maxDelimiterPartsLength]; + for (int i = 0; i < multiTokenBitmaskPerDelimiterPartsLengths.size(); i++) { + delimiterLengthToMultiTokenBitmask[i] = multiTokenBitmaskPerDelimiterPartsLengths.get(i); + } + + intRangeBitmasks = mergeIntRangeBitmasks(intRangeBitmasks); + int[] integerSubTokenBitmaskArrayRanges = new int[intRangeBitmasks.size()]; + int[] integerSubTokenBitmasks = new int[intRangeBitmasks.size()]; + for (int i = 0; i < intRangeBitmasks.size(); i++) { + IntRangeBitmask intRangeBitmask = intRangeBitmasks.get(i); + integerSubTokenBitmaskArrayRanges[i] = intRangeBitmask.range().upperBound(); + // ensure that the generic subToken (int, double and hex) bits are always set, as they apply to all integer values + integerSubTokenBitmasks[i] = intRangeBitmask.bitmask() | genericSubTokenTypesBitmask; + } + + return new CompiledSchema( + charToSubTokenBitmask, + charToCharType, + charSpecificParsingInfos, + subTokenValueToBitmaskMapBuilder.build(), + maxSubTokensPerToken, + maxTokensPerMultiToken, + maxSubTokensPerMultiToken, + intSubTokenBitmask, + allIntegerSubTokenBitmask, + genericSubTokenTypesBitmask, + integerSubTokenBitmasks, + integerSubTokenBitmaskArrayRanges, + smallIntegerSubTokenBitmasks, + subTokenCountToTokenBitmask, + tokenCountToMultiTokenBitmask, + subTokenCountToMultiTokenBitmask, + delimiterLengthToMultiTokenBitmask, + subTokenBitmaskRegistry, + tokenBitmaskRegistry, + multiTokenBitmaskRegistry + ); + } + + private static void updateBitmaskByPosition( + Map> typeToHigherTypeBitmaskByPosition, + String lowerTypeName, + int i, + int tokenBitmask + ) { + ArrayList bitmaskList = typeToHigherTypeBitmaskByPosition.computeIfAbsent(lowerTypeName, input -> new ArrayList<>()); + fillListUpToIndex(bitmaskList, i, () -> 0); + bitmaskList.set(i, bitmaskList.get(i) | tokenBitmask); + } + + private static void updateBitmaskToCount(Map typeCountToHigherTypeBitmask, int subTokenCount, int tokenBitmask) { + int bitmaskForSubTokenCount = typeCountToHigherTypeBitmask.computeIfAbsent(subTokenCount, input -> 0); + bitmaskForSubTokenCount |= tokenBitmask; + typeCountToHigherTypeBitmask.put(subTokenCount, bitmaskForSubTokenCount); + } + + private static int[] getMultiTokenBitmaskPerBoundaryCharIndex( + char tokenBoundaryCharacter, + Map> tokenBoundaryCharToMultiTokenBitmaskPerIndex, + int maxDelimiterPartsLength + ) { + int[] multiTokenBitmaskPerBoundaryCharIndex = null; + ArrayList multiTokenBitmaskPerBoundaryCharIndexArray = tokenBoundaryCharToMultiTokenBitmaskPerIndex.get( + tokenBoundaryCharacter + ); + if (multiTokenBitmaskPerBoundaryCharIndexArray != null) { + multiTokenBitmaskPerBoundaryCharIndex = new int[maxDelimiterPartsLength]; + fillListUpToIndex(multiTokenBitmaskPerBoundaryCharIndexArray, maxDelimiterPartsLength - 1, () -> 0); + for (int i = 0; i < multiTokenBitmaskPerBoundaryCharIndexArray.size(); i++) { + multiTokenBitmaskPerBoundaryCharIndex[i] = multiTokenBitmaskPerBoundaryCharIndexArray.get(i); + } + } + return multiTokenBitmaskPerBoundaryCharIndex; + } + + /** + * Adds a string constraint to the all chains at the specified positions. + * The addition step actually includes the compilation of the constraint into a runtime evaluation function. 
+ * @param chainForIndex the list of chains, one per subToken index + * @param subTokenBitmask the subToken bitmask associated with the constraint + * @param stringConstraint the string constraint to compile and add + * @param positionIndices the delimiter positions within the subToken type where the constraint should be added + */ + private static void addConstraintToChain( + ArrayList chainForIndex, + int subTokenBitmask, + StringConstraint stringConstraint, + Set positionIndices + ) { + positionIndices.forEach(position -> { + SubstringToBitmaskChain.Builder chainBuilder = chainForIndex.get(position); + if (chainBuilder == null) { + chainBuilder = SubstringToBitmaskChain.builder(); + chainForIndex.set(position, chainBuilder); + } + chainBuilder.add(stringConstraint, subTokenBitmask); + }); + } + + public static byte getCharCode(char c, Schema schema) { + if (Character.isDigit(c)) { + return DIGIT_CHAR_CODE; + } else if (Character.isAlphabetic(c)) { + return ALPHABETIC_CHAR_CODE; + } + + for (char delim : schema.getSubTokenDelimiters()) { + if (c == delim) { + return SUBTOKEN_DELIMITER_CHAR_CODE; + } + } + + for (char delim : schema.getTokenDelimiters()) { + if (c == delim) { + return TOKEN_DELIMITER_CHAR_CODE; + } + } + + for (char tokenBoundaryChar : schema.getTokenBoundaryCharacters()) { + if (c == tokenBoundaryChar) { + return TOKEN_BOUNDARY_CHAR_CODE; + } + } + + return OTHER_CHAR_CODE; + } + + private static void updateCharToSubTokenBitmasks( + String subTokenTypeName, + int[] charToSubTokenBitmask, + char[] validCharacters, + int subTokenBitmask + ) { + for (char c : validCharacters) { + if (c < ASCII_RANGE) { + charToSubTokenBitmask[c] |= subTokenBitmask; + } else { + throw new IllegalArgumentException( + "Character '" + + c + + "' in subToken type '" + + subTokenTypeName + + "' is outside the ASCII range and will not be processed." + ); + } + } + } + + private static void fillListUpToIndex(ArrayList list, int index, Supplier supplier) { + while (list.size() <= index) { + list.addLast(supplier.get()); + } + } + + @SuppressWarnings("unchecked") + private static ToIntFunction[] turnChainBuilderListToFunctionArray( + ArrayList buildersList, + int maxSubTokensPerToken + ) { + if (buildersList != null) { + ArrayList> tmpList = new ArrayList<>(); + fillListUpToIndex(tmpList, maxSubTokensPerToken - 1, () -> null); + for (int i = 0; i < buildersList.size(); i++) { + SubstringToBitmaskChain.Builder chainBuilder = buildersList.get(i); + if (chainBuilder != null) { + tmpList.set(i, chainBuilder.build()); + } + } + return tmpList.toArray(ToIntFunction[]::new); + } + return null; + } + + // ===================================================== int ranges ============================================================= + + /** + * Merges overlapping ranges in a list of {@link IntRangeBitmask} instances by combining their bitmasks using a bitwise OR operation. + * The resulting list will contain ordered non-overlapping ranges of integers, where each range is associated with a single bitmask. 
+ * @param intRangeBitmasks the list of IntRangeBitmask to reduce + * @return an ordered list of non-overlapping {@link IntRangeBitmask} instances representing the merged ranges + */ + static ArrayList mergeIntRangeBitmasks(ArrayList intRangeBitmasks) { + ArrayList boundaries = new ArrayList<>(); + for (IntRangeBitmask intRangeBitmask : intRangeBitmasks) { + boundaries.add(new RangeBoundary(intRangeBitmask.range().lowerBound(), true, intRangeBitmask.bitmask())); + boundaries.add(new RangeBoundary(intRangeBitmask.range().upperBound(), false, intRangeBitmask.bitmask())); + } + // sort boundaries by boundary value, lower bounds first + // noinspection Java8ListSort - cannot use List.sort here due to poor checkstyle algorithm + Collections.sort( + boundaries, + Comparator.comparingInt(RangeBoundary::boundary).thenComparing(RangeBoundary::isLowerBound, Comparator.reverseOrder()) + ); + + ArrayList reducedRanges = new ArrayList<>(); + + int accumulatedBitmask = 0; + Integer activeRangeStart = Integer.MIN_VALUE; + Integer activeRangeEnd = null; + Integer activeRangeBitmask = null; + + for (RangeBoundary rangeBoundary : boundaries) { + int boundary = rangeBoundary.boundary(); + + // if there is already a finished current range, add it to the result + if (activeRangeEnd != null && boundary > activeRangeEnd) { + reducedRanges.addLast(IntRangeBitmask.of(activeRangeStart, activeRangeEnd, activeRangeBitmask)); + // reset the current range + activeRangeStart = activeRangeEnd + 1; + activeRangeEnd = null; + activeRangeBitmask = null; + } + + if (rangeBoundary.isLowerBound()) { + // enter a range + if (boundary > activeRangeStart) { + // if the boundary is greater than the current range start, we should close this range and add it to the result + reducedRanges.addLast(IntRangeBitmask.of(activeRangeStart, boundary - 1, accumulatedBitmask)); + // start a new range from the current boundary + activeRangeStart = boundary; + } + // accumulate the bitmask for the current range + accumulatedBitmask |= rangeBoundary.bitmask(); + } else { + // exit a range + activeRangeEnd = boundary; + if (activeRangeBitmask == null) { + activeRangeBitmask = accumulatedBitmask; + } + // remove the range's bitmask from the accumulated bitmask + accumulatedBitmask &= ~rangeBoundary.bitmask(); + } + } + if (activeRangeEnd != null) { + reducedRanges.addLast(IntRangeBitmask.of(activeRangeStart, activeRangeEnd, activeRangeBitmask)); + if (activeRangeEnd < Integer.MAX_VALUE) { + activeRangeStart = activeRangeEnd + 1; + } else { + activeRangeStart = null; + } + } + if (activeRangeStart != null) { + reducedRanges.addLast(IntRangeBitmask.of(activeRangeStart, Integer.MAX_VALUE, accumulatedBitmask)); + } + return reducedRanges; + } + + record IntRangeBitmask(IntConstraints.Range range, int bitmask) { + static IntRangeBitmask of(int lowerBound, int upperBound, int bitmask) { + return new IntRangeBitmask(new IntConstraints.Range(lowerBound, upperBound), bitmask); + } + + @SuppressWarnings("NullableProblems") + @Override + public String toString() { + return "IntRangeBitmask{" + "range=" + range + ", bitmask=" + Integer.toBinaryString(bitmask) + '}'; + } + + @Override + public boolean equals(Object other) { + if (this == other) return true; + if (other instanceof IntRangeBitmask) { + @SuppressWarnings("PatternVariableCanBeUsed") // cannot use pattern variable due to poor checkstyle algorithm + IntRangeBitmask otherRange = (IntRangeBitmask) other; + return range.equals(otherRange.range()) && bitmask == otherRange.bitmask(); + } + return false; + 
} + + @Override + public int hashCode() { + return range.lowerBound() + range.upperBound() + bitmask; + } + } + + private record RangeBoundary(int boundary, boolean isLowerBound, int bitmask) {} + + // =================================================== Timestamp formatting ======================================================== + + /** + * Creates a {@link TimestampFormat} from a {@link MultiTokenFormat} object that represents timestamp components. + * This method processes both token parts and literal string parts to construct the final Java time format. + * @param format The MultiTokenFormat object representing the timestamp format. + * @return A TimestampFormat object containing the format string and an array indicating the order of timestamp components. + */ + static TimestampFormat createTimestampFormat(MultiTokenFormat format) { + List delimiterParts = format.getDelimiterParts(); + List tokens = format.getTokens(); + if (delimiterParts.size() != tokens.size() - 1) { + throw new IllegalArgumentException( + "Invalid MultiTokenFormat: number of delimiter parts (" + + delimiterParts.size() + + ") must be one less than number of tokens (" + + tokens.size() + + ")." + ); + } + + boolean isUsingAmPm = tokens.stream() + .anyMatch(token -> Arrays.stream(token.format().getSubTokenTypes()).anyMatch(subToken -> subToken.name().equals("AP"))); + + int[] timestampComponentsOrder = new int[TimestampComponentType.values().length]; + Arrays.fill(timestampComponentsOrder, -1); + int nextComponentIndex = 0; + StringBuilder javaTimeFormat = new StringBuilder(); + for (int i = 0; i < tokens.size(); i++) { + org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType token = tokens.get(i); + StringBuilder tokenJavaTimeFormat = new StringBuilder(); + nextComponentIndex += appendTimestampComponents( + token, + tokenJavaTimeFormat, + timestampComponentsOrder, + nextComponentIndex, + isUsingAmPm + ); + javaTimeFormat.append(tokenJavaTimeFormat); + if (i < delimiterParts.size()) { + String delimiterPart = delimiterParts.get(i); + for (char c : delimiterPart.toCharArray()) { + appendDelimiter(javaTimeFormat, c); + } + } + } + return new TimestampFormat(javaTimeFormat.toString(), timestampComponentsOrder); + } + + /** + * Creates a {@link TimestampFormat} from a single {@link org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType} + * object that represents a timestamp format. + * @param timestampToken A TokenType object representing the timestamp format. + * @return A TimestampFormat object containing the format string and an array indicating the order of timestamp components. + */ + static TimestampFormat createTimestampFormat(org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType timestampToken) { + StringBuilder javaTimeFormat = new StringBuilder(); + int[] timestampComponentsOrder = new int[TimestampComponentType.values().length]; + Arrays.fill(timestampComponentsOrder, -1); + boolean isUsingAmPm = Arrays.stream(timestampToken.format().getSubTokenTypes()).anyMatch(subToken -> subToken.name().equals("AP")); + appendTimestampComponents(timestampToken, javaTimeFormat, timestampComponentsOrder, 0, isUsingAmPm); + return new TimestampFormat(javaTimeFormat.toString(), timestampComponentsOrder); + } + + /** + * Appends the details of a given token to the provided javaTimeFormat and updates the timestampComponentsOrder array to reflect the + * order of timestamp components. 
+ * @param token the TokenType object representing the timestamp format + * @param javaTimeFormat the StringBuilder to append the Java time format string + * @param timestampComponentsOrder an array to store the order of timestamp components + * @param nextComponentIndex the next index to use in the timestampComponentsOrder array + * @return the number of timestamp components appended to the javaTimeFormat + */ + private static int appendTimestampComponents( + org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType token, + StringBuilder javaTimeFormat, + int[] timestampComponentsOrder, + int nextComponentIndex, + boolean isUsingAmPm + ) { + StringBuilder tokenJavaTimeFormat = new StringBuilder(); + TokenFormat format = token.format(); + org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.SubTokenType[] subTokenTypes = format.getSubTokenTypes(); + char[] delimiters = format.getSubTokenDelimiters(); + + int appendedComponents = 0; + for (int i = 0; i < subTokenTypes.length; i++) { + org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.SubTokenType subToken = subTokenTypes[i]; + TimestampComponentType componentType = subToken.getTimestampComponentType(); + if (componentType != TimestampComponentType.NA) { + timestampComponentsOrder[componentType.getCode()] = i + nextComponentIndex; + tokenJavaTimeFormat.append(mapSubTokenTypeToPattern(subToken, isUsingAmPm)); + if (i < delimiters.length) { + appendDelimiter(tokenJavaTimeFormat, delimiters[i]); + } + appendedComponents++; + } + } + javaTimeFormat.append(tokenJavaTimeFormat); + return appendedComponents; + } + + private static void appendDelimiter(StringBuilder builder, char delimiter) { + // Escape characters that have special meaning in DateTimeFormatter patterns + if ("'[]#{}T".indexOf(delimiter) != -1) { + builder.append('\'').append(delimiter).append('\''); + } else { + builder.append(delimiter); + } + } + + private static String mapSubTokenTypeToPattern( + org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.SubTokenType subTokenType, + boolean useAmPm + ) { + TimestampComponentType componentType = subTokenType.getTimestampComponentType(); + if (componentType == TimestampComponentType.NA) { + return ""; + } + + // The mapping is based on the sub-token definitions in schema.yaml + return switch (subTokenType.name()) { + case "YYYY" -> "yyyy"; // 4-digit year + case "MM" -> "MM"; // 2-digit month + case "Mon" -> "MMM"; // 3-letter month abbreviation + case "DD" -> "dd"; // 2-digit day + case "hh" -> useAmPm ? "hh" : "HH"; // 2-digit hour (either 12-hour or 24-hour clock - 0-23) + case "mm" -> "mm"; // 2-digit minute + case "ss" -> "ss"; // 2-digit second + case "ms" -> "SSS"; // 3-digit millisecond + case "us" -> "SSSSSS"; // 6-digit microsecond + case "AP" -> "a"; // AM/PM marker + case "TZA", "TZOhhmm" -> "Z"; // Timezone offset like +0200 or -0500 + case "Day" -> "E"; // Day of week abbreviation + default -> ""; + }; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SubstringToBitmaskChain.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SubstringToBitmaskChain.java new file mode 100644 index 0000000000000..0d3d1ab249127 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SubstringToBitmaskChain.java @@ -0,0 +1,141 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.compiler; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubstringToIntegerMap; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubstringView; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.AndStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.AnyString; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.EqualsStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.LengthStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.NotEqualsStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.OrStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringSetConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringToIntMapConstraint; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.ToIntFunction; + +/** + * A chain of {@code ToIntFunction} that can be applied in sequence to produce a bitmask for a given substring. + * Each function in the chain produces a bitmask that is ORed together to produce the final result. + * This is a way of applying multiple {@code StringConstraint} evaluations on a given substring into a single bitmask. + * In order to optimize the evaluation, any map/set based evaluations are combined into a single + * {@link SubstringToIntegerMap} evaluation. In addition, any "any string" constraints are combined into a single bitmask that is always + * applied. + * Conceptually, each function in the chain represents the constraints related to a single sub-token from the schema. Some of these + * may be composite constraints (AND/OR), in which case a single function may represent multiple inner evaluations. Technically, however, + * this is only the case for AND constraints, as OR constraints are decomposed into multiple functions in the chain. + */ +public class SubstringToBitmaskChain implements ToIntFunction { + /** + * Any string evaluation that matches a map/set pattern can be combined into a single map evaluation for efficiency. + */ + private final SubstringToIntegerMap substringToBitmaskMap; + + /** + * A chain of functions to apply in sequence to produce the combined bitmask. + */ + private final ToIntFunction[] chain; + + /** + * A bitmask that matches all strings. + */ + private final int anyStringBitmask; + + @SuppressWarnings("unchecked") + private SubstringToBitmaskChain( + List> chainList, + SubstringToIntegerMap substringToBitmaskMap, + int anyStringBitmask + ) { + this.chain = chainList.toArray(ToIntFunction[]::new); + this.substringToBitmaskMap = substringToBitmaskMap; + this.anyStringBitmask = anyStringBitmask; + } + + @Override + public int applyAsInt(SubstringView value) { + int result = substringToBitmaskMap != null ? 
substringToBitmaskMap.applyAsInt(value) : 0; + // noinspection ForLoopReplaceableByForEach + for (int i = 0; i < chain.length; i++) { + result |= chain[i].applyAsInt(value); + } + result |= anyStringBitmask; + return result; + } + + public static class Builder { + private final List> chainList; + private final SubstringToIntegerMap.Builder substringToBitmaskMapBuilder; + private int anyStringBitmask; + + private Builder() { + this.chainList = new ArrayList<>(); + this.substringToBitmaskMapBuilder = SubstringToIntegerMap.builder(); + this.anyStringBitmask = 0; + } + + /** + * Add a StringConstraint with the associated bitmask to the chain being built. + * The constraint can be a composite one (AND/OR), in which case it will be decomposed accordingly, in which case it is + * important to distinguish two different steps of runtime evaluation: + *
+ * <ul>
+ *   <li>The evaluation of the compiled form of a composite constraint may include multiple inner evaluations, but should
+ * always result in a single bitmask. Nevertheless, in case of an OR composite constraint, the inner evaluations can be added
+ * directly to the chain, as opposed to AND composite constraints, which need to be wrapped in a function that ensures all inner
+ * evaluations are applied in an AND manner.</li>
+ *   <li>The evaluation of the chain itself involves the application of multiple constraints, each producing a bitmask that is
+ * ORed together to produce the final result.</li>
+ * </ul>
+ * @param stringConstraint the StringConstraint to add, potentially a composite constraint + * @param bitmask the bitmask associated with the constraint + * @return this builder for method chaining + */ + public Builder add(StringConstraint stringConstraint, int bitmask) { + switch (stringConstraint) { + case StringToIntMapConstraint constraint -> substringToBitmaskMapBuilder.addAll(constraint.map().keySet(), bitmask); + case StringSetConstraint constraint -> substringToBitmaskMapBuilder.addAll(constraint.keys(), bitmask); + case EqualsStringConstraint constraint -> substringToBitmaskMapBuilder.add(constraint.targetValue(), bitmask); + case NotEqualsStringConstraint constraint -> chainList.add(SubstringToBitmaskFunctionFactory.from(bitmask, constraint)); + case LengthStringConstraint constraint -> chainList.add(SubstringToBitmaskFunctionFactory.from(bitmask, constraint)); + case AnyString ignored -> this.anyStringBitmask |= bitmask; + case OrStringConstraint orConstraint -> add(orConstraint.first(), bitmask).add(orConstraint.second(), bitmask); + case AndStringConstraint andConstraint -> chainList.add(SubstringToBitmaskFunctionFactory.from(bitmask, andConstraint)); + default -> throw new IllegalArgumentException( + "Unsupported StringConstraint type: " + stringConstraint.getClass().getSimpleName() + ); + } + return this; + } + + /** + * Build the SubstringToBitmaskChain instance. Try to optimize the result by avoiding unnecessary chaining. + * @return the built SubstringToBitmaskChain instance + */ + public ToIntFunction build() { + SubstringToIntegerMap map = substringToBitmaskMapBuilder.isEmpty() ? null : substringToBitmaskMapBuilder.build(); + if (chainList.isEmpty() && anyStringBitmask == 0) { + return map; + } + if (map == null && anyStringBitmask == 0 && chainList.size() == 1) { + return chainList.getFirst(); + } + if (map == null && chainList.isEmpty()) { + return input -> anyStringBitmask; + } + return new SubstringToBitmaskChain(chainList, map, anyStringBitmask); + } + } + + public static Builder builder() { + return new Builder(); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SubstringToBitmaskFunctionFactory.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SubstringToBitmaskFunctionFactory.java new file mode 100644 index 0000000000000..58f73a2e91f9a --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SubstringToBitmaskFunctionFactory.java @@ -0,0 +1,62 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.compiler; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubstringToIntegerMap; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubstringView; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.AndStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.AnyString; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.EqualsStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.LengthStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.NotEqualsStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.OrStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringSetConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringToIntMapConstraint; + +import java.util.function.ToIntFunction; + +public class SubstringToBitmaskFunctionFactory { + public static ToIntFunction from(int bitmask, StringConstraint stringConstraint) { + return switch (stringConstraint) { + case StringToIntMapConstraint constraint -> SubstringToIntegerMap.builder().addAll(constraint.map().keySet(), bitmask).build(); + case StringSetConstraint constraint -> SubstringToIntegerMap.builder().addAll(constraint.keys(), bitmask).build(); + case EqualsStringConstraint constraint -> new ToIntFunction<>() { + private final SubstringView target = new SubstringView(constraint.targetValue()); + + @Override + public int applyAsInt(SubstringView input) { + return target.equals(input) ? bitmask : 0; + } + }; + case NotEqualsStringConstraint constraint -> new ToIntFunction<>() { + private final SubstringView target = new SubstringView(constraint.targetValue()); + + @Override + public int applyAsInt(SubstringView input) { + return target.equals(input) ? 0 : bitmask; + } + }; + case LengthStringConstraint constraint -> input -> input.length() == constraint.requiredLength() ? bitmask : 0; + case AnyString ignored -> input -> bitmask; + case OrStringConstraint orConstraint -> SubstringToBitmaskChain.builder() + .add(orConstraint.first(), bitmask) + .add(orConstraint.second(), bitmask) + .build(); + case AndStringConstraint andConstraint -> and(from(bitmask, andConstraint.first()), from(bitmask, andConstraint.second())); + default -> throw new IllegalArgumentException( + "Unsupported StringConstraint type: " + stringConstraint.getClass().getSimpleName() + ); + }; + + } + + private static ToIntFunction and(final ToIntFunction first, final ToIntFunction second) { + return input -> first.applyAsInt(input) & second.applyAsInt(input); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/package-info.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/package-info.java new file mode 100644 index 0000000000000..0e5f5a7499baa --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/package-info.java @@ -0,0 +1,96 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/** + * This package contains the core classes and interfaces for an efficient text parser that extracts patterns from text lines, typically + * from log files. Each line is parsed into a sequence of tokens and subTokens, which are then processed based on a predefined schema. + * The eventual output is an ordered list of {@link org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Argument Argument} + * instances, corresponding to the pattern parameters (including the timestamp), along with their types and positions in the original + * text. + * This output encapsulates all required information in order to reconstruct the template if needed, as well as the corresponding + * parameter values and their offsets. + * + *

+ * <h2>Key Concepts</h2>
+ * The heart of the system is the {@code schema.yaml} file, which contains a hierarchical definition of patterns to extract, including:
+ * <ul>
+ *   <li>Tokens - Space/tab delimited elements in a line of text</li>
+ *   <li>SubTokens - Smaller components within tokens, separated by characters like periods, colons, etc.</li>
+ *   <li>MultiTokens - Sequences of tokens that together form a larger pattern</li>
+ * </ul>
+ *
+ * The parsing workflow involves:
+ * <ol>
+ *   <li>Loading and compiling the schema into efficient data structures</li>
+ *   <li>Analyzing input text character-by-character in a single pass</li>
+ *   <li>Identifying tokens and their potential matches against defined patterns</li>
+ *   <li>Extracting relevant information based on successful pattern matches into pattern arguments</li>
+ * </ol>
+ *
+ * The core of the parser's efficiency comes from its use of bitmasks. Here's how it works (a standalone illustrative sketch of this
+ * elimination idea follows the usage example below):
+ * <ul>
+ *   <li>Bit Allocation: During the compilation phase, each sub-token, token, and multi-token type is assigned a unique bit in a 32-bit
+ * integer.</li>
+ *   <li>Stateful Parsing: The parser processes the input string character by character. At each step, it maintains a set of bitmasks
+ * that represent the possible types for the current sub-token, token, and multi-token being parsed.</li>
+ *   <li>Elimination: As the parser consumes more characters, it eliminates possibilities by performing bitwise AND operations on the
+ * bitmasks. For example, if the parser encounters a non-numeric character, it will clear the bits corresponding to all integer-based
+ * sub-token types.</li>
+ *   <li>Type Identification: When a delimiter is reached, the parser finalizes the type of the preceding token or sub-token by
+ * identifying the highest-priority bit that is still set in the corresponding bitmask.</li>
+ * </ul>
+ *
+ * <h2>Performance Principles</h2>
+ * The parsing schema is compiled into data structures that facilitate efficient parsing, according to the following principles:
+ * <ul>
+ *   <li>Ensure linear complexity by enforcing a single character-by-character pass. We may maintain as many parsing states as required
+ * for detecting template parameters, as long as we avoid backtracking and similar complexity-increasing operations that are used by
+ * regular expression engines and the like.</li>
+ *   <li>Moreover, complexity should remain linear and additional overhead should be negligible when adding more patterns to detect
+ * (i.e., when extending the schema).</li>
+ *   <li>Execute minimal and inexpensive operations for most parsed characters and execute heavier computations as rarely as
+ * possible.</li>
+ *   <li>The former principle can be achieved by eliminating potential matches as early as possible and applying more expensive
+ * operations only on specific characters (e.g., subToken/token delimiters) and only if they are still required (meaning - only if not
+ * all options for a match were already eliminated).</li>
+ *   <li>Have a bias towards using inexpensive computations like bitwise operations or simple calculations. For example, bitmasks
+ * provide manipulation of multiple states with a single inexpensive bitwise operation.</li>
+ *   <li>Use fast access structures (like arrays) for in-parsing lookups and prefer cache-friendly structures
+ * (like primitive arrays).</li>
+ *   <li>Use JIT-friendly concepts, like immutable and final classes and short methods (to favor fast inlining).</li>
+ *   <li>Avoid allocations as much as possible in the parsing loop.</li>
+ *   <li>Reduce method calls to a minimum. For example, with careful design of the parsing loop it is possible to only collect
+ * "decisions" and apply them at the end of the loop, thus avoiding method calls, while still not duplicating code.</li>
+ * </ul>
+ *
+ * <h2>Usage</h2>
+ * The main entry point for using the parser is the
+ * {@link org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.ParserFactory ParserFactory} class.
+ * The factory provides a static method to create a new parser instance.
+ * The parser can then be used to parse text lines and return an ordered list of typed arguments. Each argument includes its type,
+ * extracted value, and its position within the original text, allowing the caller to construct the template if needed.
+ * A reference implementation for constructing the template from the original text and the list of arguments is provided by the
+ * {@link org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Parser#constructPattern Parser.constructPattern} static method.
+ *
+ * <pre>{@code
+ * import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Parser;
+ * import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.ParserFactory;
+ * import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.PatternedMessage;
+ *
+ * public class ParserExample {
+ *     public static void main(String[] args) {
+ *         Parser parser = ParserFactory.createParser();
+ *         String logLine = "2023-10-05 14:30:25 INFO received 305 packets from 135.122.123.222";
+ *         List<Argument<?>> arguments = parser.parse(logLine);
+ *         StringBuilder pattern = new StringBuilder();
+ *         Parser.constructPattern(logLine, arguments, pattern, true);
+ *         System.out.println(pattern.toString()); // Outputs: "%T INFO received %N packets from %4"
+ *     }
+ * }
+ * }</pre>
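The bitmask elimination described under "Key Concepts" above can be made concrete with a small, self-contained sketch. This is an editorial illustration only, not code from this change: the type names (WORD_BIT, HEX_BIT, INTEGER_BIT) and the classify/allowedTypes helpers are invented for the example, and the real parser uses precomputed per-character lookup arrays and the schema-driven registries rather than the if/else chain shown here.

public final class BitmaskEliminationSketch {

    // One bit per candidate sub-token type. Generic types get the lowest bits (lowest priority),
    // more specific types get higher bits, mirroring the registration order used by the compiler.
    private static final int WORD_BIT = 1;          // letters or digits (most generic)
    private static final int HEX_BIT = 1 << 1;      // digits or a-f
    private static final int INTEGER_BIT = 1 << 2;  // digits only (most specific)

    public static void main(String[] args) {
        for (String subToken : new String[] { "2023", "7f3a", "took", "x9" }) {
            System.out.println(subToken + " -> " + classify(subToken));
        }
    }

    // Start with every candidate bit set and AND away possibilities character by character.
    static String classify(String subToken) {
        int candidates = WORD_BIT | HEX_BIT | INTEGER_BIT;
        for (int i = 0; i < subToken.length() && candidates != 0; i++) {
            candidates &= allowedTypes(subToken.charAt(i));
        }
        if (candidates == 0) {
            return "unknown";
        }
        // At a delimiter, the leftmost (highest-priority) surviving bit decides the type.
        int winner = Integer.highestOneBit(candidates);
        if (winner == INTEGER_BIT) {
            return "integer";
        }
        return winner == HEX_BIT ? "hex" : "word";
    }

    // Stand-in for a precomputed char-indexed bitmask table: which candidate types allow this character.
    static int allowedTypes(char c) {
        if (c >= '0' && c <= '9') {
            return WORD_BIT | HEX_BIT | INTEGER_BIT;
        }
        if (c >= 'a' && c <= 'f') {
            return WORD_BIT | HEX_BIT;
        }
        if (Character.isLetter(c)) {
            return WORD_BIT;
        }
        return 0;
    }
}

Running the sketch prints "2023 -> integer", "7f3a -> hex", "took -> word", and "x9 -> word": each character narrows the candidate set with a single bitwise AND, and the surviving high bit selects the most specific type.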
+ * + */ +package org.elasticsearch.xpack.logsdb.patternedtext.charparser; diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/BitmaskRegistry.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/BitmaskRegistry.java new file mode 100644 index 0000000000000..184daa2e07070 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/BitmaskRegistry.java @@ -0,0 +1,343 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/** + * A registry that allocates and maps bits to {@link ParsingType} instances. + * The order of registration is important because the earlier a type is registered, the lower its priority in the registry. + * This corresponds to the bit index in the bitmask: as the bit index reflects the order of registration, it also reflects the priority of + * the corresponding instance. The first registered instance will have the lowest bit index (0), and thus the lowest priority. + * This means that when examining a bitmask with multiple bits set, the leftmost bit has the highest priority in that bitmask. + * See {@link #getLeftmostBitIndex(int)} and {@link #getHighestPriorityType(int)} as the APIs to retrieve the highest priority type. + * Therefore, register lower priority types (typically - more generic ones) first, and higher priority types later. + * + * @param the type to register + */ +public final class BitmaskRegistry { + private final Map typeToBitmask; + + // array for quick access to instances by bit index + private final T[] typesByBitIndex; + + // a two-dimensional array for quick access to higher level bitmasks by position for each type + // the first dimension is the position, and the second dimension is the type bit index + // for example, higherLevelBitmaskByPosition[2][3] will give the higher level bitmask for the type at bit index 3 at position 2 + private int[][] higherLevelBitmaskByPosition; + + private volatile int nextBitIndex; + private volatile int accumulativeBitmask; + private int combinedBitmask; + + private volatile boolean sealed = false; + + @SuppressWarnings("unchecked") + public BitmaskRegistry() { + this.typeToBitmask = new HashMap<>(); + // noinspection unchecked - we maintain type safety through APIs, the array is private an inaccessible otherwise + this.typesByBitIndex = (T[]) new ParsingType[32]; // 32 bits for integer bitmask + this.nextBitIndex = 0; + this.accumulativeBitmask = 0; + this.combinedBitmask = 0; + } + + /** + * Registers a {@link ParsingType} instance and allocates a bit for it. + * Later registrations will have higher priority. + * Types can only be registered before sealing the registry. See {@link #seal()} for more details. 
+ * + * @param type the type to register + * @return the bitmask for the registered type, where only one bit is set + * @throws IllegalStateException if more than 32 types are registered + * @throws IllegalArgumentException if the type is already registered + */ + public synchronized int register(T type) { + if (sealed) { + throw new IllegalStateException("Cannot register new types after sealing the registry"); + } + + if (nextBitIndex >= 32) { + throw new IllegalStateException("Cannot register more than 32 instances due to integer bit limit"); + } + if (typeToBitmask.containsKey(type)) { + throw new IllegalArgumentException("Type is already registered: " + type); + } + + int bitIndex = nextBitIndex++; + int bitmask = 1 << bitIndex; + + typeToBitmask.put(type, bitmask); + typesByBitIndex[bitIndex] = type; + accumulativeBitmask |= bitmask; + + // update higher level bitmask by position + if (type.higherLevelBitmaskByPosition != null) { + int numPositions = type.higherLevelBitmaskByPosition.length; + ensureHigherLevelBitmaskByPositionCapacity(numPositions - 1); + for (int position = 0; position < numPositions; position++) { + higherLevelBitmaskByPosition[position][bitIndex] = type.higherLevelBitmaskByPosition[position]; + } + } + + return bitmask; + } + + private void ensureHigherLevelBitmaskByPositionCapacity(int position) { + if (higherLevelBitmaskByPosition == null) { + higherLevelBitmaskByPosition = new int[position + 1][32]; + } else if (higherLevelBitmaskByPosition.length <= position) { + int[][] newArray = new int[position + 1][32]; + System.arraycopy(higherLevelBitmaskByPosition, 0, newArray, 0, higherLevelBitmaskByPosition.length); + higherLevelBitmaskByPosition = newArray; + } + } + + /** + * Seals the registry, preventing further registrations. + * This is useful to ensure that the bitmask does not change after a certain point, + * allowing for safe concurrent access. + */ + public synchronized void seal() { + if (sealed) { + throw new IllegalStateException("Registry is already sealed"); + } + combinedBitmask = accumulativeBitmask; + sealed = true; + } + + /** + * Checks if the registry is sealed. + * + * @return true if the registry is sealed, false otherwise + */ + @SuppressWarnings("BooleanMethodIsAlwaysInverted") + public boolean isSealed() { + return sealed; + } + + /** + * NOTE: not an optimized API + * Gets the bit index associated with the given type. + * + * @param type the type to look up + * @return the bit index for the given type + * @throws IllegalArgumentException if the type is not registered + */ + @NonOptimizedAPI + public int getBitIndex(T type) { + Integer bitmask = typeToBitmask.get(type); + if (bitmask == null) { + throw new IllegalArgumentException("Type is not registered: " + type); + } + return Integer.numberOfTrailingZeros(bitmask); + } + + /** + * Returns an unmodifiable collection with a view of all registered types. + * @return an unmodifiable collection view of all registered types + */ + @NonOptimizedAPI + public Collection getAllRegisteredTypes() { + return Set.copyOf(typeToBitmask.keySet()); + } + + /** + * NOTE: not an optimized API + * Gets the bitmask associated with the given type. 
+ * + * @param type the type to look up + * @return the bitmask for the type + * @throws IllegalArgumentException if the type is not registered + */ + @NonOptimizedAPI + public int getBitmask(T type) { + Integer bitmask = typeToBitmask.get(type); + if (bitmask == null) { + throw new IllegalArgumentException("Type is not registered: " + type); + } + return bitmask; + } + + /** + * An optimized API to get the combined higher level bitmask that corresponds to the given bitmask and position. + * The provided bitmask represents all valid types in this level, and the position indicates is used to look up which higher level + * bits are valid for the given position through {@link ParsingType#getHigherLevelBitmaskByPosition(int)}. + * For example, if this is a sub-token type registry, this method will return the combined bitmask of all token types that are valid + * for all sub-tokens types represented by the given bitmask at the specified position. + * + * @param bitmask the bitmask representing the valid types in this level + * @param position the position of this level instance within its higher-level entity + * @return the combined higher level bitmask for the given bitmask and position + */ + @OptimizedAPI + public int getHigherLevelBitmaskByPositionOld(int bitmask, int position) { + int resultBitmask = 0; + int currentBitIndex = 0; + while (bitmask != 0) { + if ((bitmask & 1) != 0) { + resultBitmask |= typesByBitIndex[currentBitIndex].getHigherLevelBitmaskByPosition(position); + } + bitmask >>>= 1; + currentBitIndex++; + } + return resultBitmask; + } + + @OptimizedAPI + public int getHigherLevelBitmaskByPosition(int bitmask, int position) { + int[] higherLevelBitmaskForPosition = higherLevelBitmaskByPosition[position]; + int resultBitmask = 0; + int currentBitIndex = 0; + while (bitmask != 0) { + if ((bitmask & 1) != 0) { + // if the rightmost bit is set - update the higher-level bitmask for the current bit index + resultBitmask |= higherLevelBitmaskForPosition[currentBitIndex]; + } + bitmask >>>= 1; + currentBitIndex++; + } + return resultBitmask; + } + + /** + * NOTE: not an optimized API + * Gets the bitmask associated with the given type name. + * + * @param name the name of the type to look up + * @return the bitmask for the type with the given name + * @throws IllegalArgumentException if no type is registered with the given name + */ + @NonOptimizedAPI + public int getBitmask(String name) { + for (Map.Entry entry : typeToBitmask.entrySet()) { + if (entry.getKey().name().equals(name)) { + return entry.getValue(); + } + } + throw new IllegalArgumentException("No type is registered with name: " + name); + } + + /** + * Returns a bitmask with all bits corresponding to registered types turned on. + * + * @return the combined bitmask of all registered types + */ + @OptimizedAPI + public int getCombinedBitmask() { + return combinedBitmask; + } + + /** + * NOTE: not an optimized API + * Retrieves the type associated with the given name. + * + * @param name the name of the type to look up + * @return the type associated with the given name + * @throws IllegalArgumentException if no type is registered with the given name + */ + @NonOptimizedAPI + public T getTypeByName(final String name) { + for (T type : typeToBitmask.keySet()) { + if (type.name().equals(name)) { + return type; + } + } + throw new IllegalArgumentException("No type is registered with name: " + name); + } + + /** + * Optimized API for retrieving the type associated with the given bit index. 
+ * + * @param bitIndex the bit index to look up + * @return the type associated with the given bit index + */ + @OptimizedAPI + public T getTypeByBitIndex(final int bitIndex) { + return typesByBitIndex[bitIndex]; + } + + /** + * Optimized API to get the leftmost (the highest priority) bit's index from the given bitmask. + * @param bitmask the bitmask to examine + * @return the index of the leftmost bit + */ + @OptimizedAPI + public static int getLeftmostBitIndex(final int bitmask) { + return 31 - Integer.numberOfLeadingZeros(bitmask); + } + + /** + * Optimized API to get the type associated with the leftmost (the highest priority) bit in the given bitmask. + * This method expects a non-zero bitmask, otherwise it will throw an exception. + * + * @param bitmask the bitmask to examine + * @return the type associated with the leftmost bit in the bitmask + */ + @OptimizedAPI + public T getHighestPriorityType(final int bitmask) { + return typesByBitIndex[getLeftmostBitIndex(bitmask)]; + } + + /** + * Optimized API to get the unique type associated with the given bitmask. + * If the bitmask doesn't have exactly one bit set, it will return {@code null}. + * + * @param bitmask the bitmask to examine + * @return the unique type associated with the given bitmask, or {@code null} if the bitmask doesn't have exactly one bit set + */ + @OptimizedAPI + public T getUniqueType(final int bitmask) { + if (bitmask == 0) { + return null; + } + if ((bitmask & (bitmask - 1)) != 0) { + // more than one bit is set + return null; + } + return getHighestPriorityType(bitmask); + } + + /** + * Returns the number of types registered in this registry. + * + * @return the number of registered types + */ + public int size() { + return typeToBitmask.size(); + } + + /** + * An annotation to indicate optimized APIs. + * This annotation is retained only at compile time. + */ + @Retention(RetentionPolicy.CLASS) + @Target(ElementType.METHOD) + public @interface OptimizedAPI { + // This annotation is used to mark methods that are optimized for performance. + // Make sure to use only methods annotated with this during the parsing process. + } + + /** + * An annotation to indicate NON-optimized APIs. + * This annotation is retained only at compile time. + */ + @Retention(RetentionPolicy.CLASS) + @Target(ElementType.METHOD) + public @interface NonOptimizedAPI { + // This annotation is used to mark methods that are not optimized for performance. + // Refrain from using these methods during the parsing process. + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/CharParser.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/CharParser.java new file mode 100644 index 0000000000000..556c38c0fb35b --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/CharParser.java @@ -0,0 +1,708 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Argument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.DoubleArgument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.HexadecimalArgument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.IPv4Argument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.IntegerArgument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.KeywordArgument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.ParseException; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Parser; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Sign; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Timestamp; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.UUIDArgument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.compiler.CompiledSchema; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.function.ToIntFunction; + +import static org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes.DIGIT_CHAR_CODE; +import static org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes.LINE_END_CODE; +import static org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes.SUBTOKEN_DELIMITER_CHAR_CODE; +import static org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes.TOKEN_BOUNDARY_CHAR_CODE; +import static org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes.TOKEN_DELIMITER_CHAR_CODE; + +/** + * During parsing, the different current bitmasks represent a superset of all applicable types for the currently parsed entity (subToken, + * token, or multi-token). + * Entity types are excluded through elimination, meaning that the bitmasks are reset to contain all possible entity types at the start of + * each entity parsing, and then are updated through AND operations. 
+ */ +@SuppressWarnings("ExtractMethodRecommender") +public final class CharParser implements Parser { + + // this is the compiled schema information + private final CompiledSchema compiledSchema; + + private final BitmaskRegistry subTokenBitmaskRegistry; + private final BitmaskRegistry tokenBitmaskRegistry; + private final BitmaskRegistry multiTokenBitmaskRegistry; + + // a fast lookup table for subToken bitmasks based on the character + private final int[] charToSubTokenBitmask; + private final int numUsedCharacters; + + // a fast lookup table for character types based on the character + private final byte[] charToCharType; + + private final CharSpecificParsingInfo[] charSpecificParsingInfos; + private final SubstringToIntegerMap subTokenNumericValueRepresentationMap; + + // a fast lookup table for all valid multi-token bitmasks based on the length of the concatenated delimiter parts + private final int[] delimiterPartsLengthToMultiTokenBitmask; + + // special bitmasks + private final int intSubTokenBitmask; + private final int genericSubTokenTypesBitmask; + private final int allSubTokenBitmask; + private final int allTokenBitmask; + private final int allMultiTokenBitmask; + private final int maxSubTokensPerMultiToken; + private final int maxTokensPerMultiToken; + private final int smallIntegerSubTokenUpperBound; + + // current subToken state + private int currentSubTokenStartIndex; + private int currentSubTokenBitmask; + private int currentSubTokenIntValue; + private boolean isCurSubTokenContainsDigits; + private int currentSubTokenPrefixEndIndex; + private int currentSubTokenSuffixStartIndex; + private Sign currentSubTokenSignPrefix; + + // current token state + private int currentTokenBitmask; + private int currentTokenStartIndex; + private int currentTokenSubTokenStartIndex; + private int currentTokenSubTokenIndex; + private boolean isPotentialDecimalNumber; + + // current multi-token state + private int currentMultiTokenStartIndex; + private int currentMultiTokenBitmask; + private int currentDelimiterPartPosition; + + // sub-token buffers + private int bufferedSubTokensIndex; + private final int[] bufferedSubTokenBitmasks; + private final int[] bufferedSubTokenIntValues; + private final Sign[] bufferedSubTokenSigns; + private final int[] bufferedSubTokenStartIndexes; + private final int[] bufferedSubTokenLengths; + + // token buffers + private int bufferedTokensIndex; + private final TokenType[] bufferedTokens; + private final int[] bufferedTokenStartIndexes; + private final int[] bufferedTokenLengths; + private final boolean[] isBufferedTokenDecimalNumber; + // the index of the first sub-token and number of sub-tokens for each buffered token + private final int[] bufferedTokenSubTokenFirstIndexes; + private final int[] bufferedTokenSubTokenLastIndexes; + + public CharParser(CompiledSchema compiledSchema) { + this.compiledSchema = compiledSchema; + this.subTokenBitmaskRegistry = compiledSchema.subTokenBitmaskRegistry; + this.tokenBitmaskRegistry = compiledSchema.tokenBitmaskRegistry; + this.multiTokenBitmaskRegistry = compiledSchema.multiTokenBitmaskRegistry; + this.charToSubTokenBitmask = compiledSchema.charToSubTokenBitmask; + this.numUsedCharacters = this.charToSubTokenBitmask.length; + this.charToCharType = compiledSchema.charToCharType; + this.charSpecificParsingInfos = compiledSchema.charSpecificParsingInfos; + this.subTokenNumericValueRepresentationMap = compiledSchema.subTokenNumericValueRepresentation; + this.delimiterPartsLengthToMultiTokenBitmask = 
compiledSchema.delimiterPartsTotalLengthToMultiTokenBitmask; + this.intSubTokenBitmask = compiledSchema.intSubTokenBitmask; + this.genericSubTokenTypesBitmask = compiledSchema.genericSubTokenTypesBitmask; + this.allSubTokenBitmask = subTokenBitmaskRegistry.getCombinedBitmask(); + this.allTokenBitmask = tokenBitmaskRegistry.getCombinedBitmask(); + this.allMultiTokenBitmask = multiTokenBitmaskRegistry.getCombinedBitmask(); + this.maxSubTokensPerMultiToken = compiledSchema.maxSubTokensPerMultiToken; + this.maxTokensPerMultiToken = compiledSchema.maxTokensPerMultiToken; + this.smallIntegerSubTokenUpperBound = compiledSchema.smallIntegerSubTokenBitmasks.length; + + bufferedSubTokenBitmasks = new int[maxSubTokensPerMultiToken]; + bufferedSubTokenIntValues = new int[maxSubTokensPerMultiToken]; + bufferedSubTokenSigns = new Sign[maxSubTokensPerMultiToken]; + bufferedSubTokenStartIndexes = new int[maxSubTokensPerMultiToken]; + bufferedSubTokenLengths = new int[maxSubTokensPerMultiToken]; + + bufferedTokens = new TokenType[maxTokensPerMultiToken]; + bufferedTokenStartIndexes = new int[maxTokensPerMultiToken]; + bufferedTokenLengths = new int[maxTokensPerMultiToken]; + isBufferedTokenDecimalNumber = new boolean[maxTokensPerMultiToken]; + bufferedTokenSubTokenFirstIndexes = new int[maxTokensPerMultiToken]; + bufferedTokenSubTokenLastIndexes = new int[maxTokensPerMultiToken]; + } + + private void resetSubTokenState() { + currentSubTokenBitmask = allSubTokenBitmask; + currentSubTokenStartIndex = -1; + currentSubTokenIntValue = 0; + isCurSubTokenContainsDigits = false; + currentSubTokenPrefixEndIndex = -1; + currentSubTokenSuffixStartIndex = -1; + currentSubTokenSignPrefix = null; + } + + private void resetTokenState() { + currentTokenBitmask = allTokenBitmask; + currentTokenStartIndex = -1; + currentTokenSubTokenStartIndex = -1; + currentTokenSubTokenIndex = -1; + isPotentialDecimalNumber = false; + } + + private void resetMultiTokenState() { + currentMultiTokenBitmask = allMultiTokenBitmask; + currentMultiTokenStartIndex = -1; + currentDelimiterPartPosition = -1; + } + + private void resetBuffers() { + // no need to actually reset buffers, enough to reset the indexes + bufferedSubTokensIndex = -1; + bufferedTokensIndex = -1; + } + + private void reset() { + resetSubTokenState(); + resetTokenState(); + resetMultiTokenState(); + resetBuffers(); + } + + /** + * Parses a raw text message and extracts an ordered list of typed arguments. + * + *

+ * <p>The algorithm operates on three levels: sub-tokens → tokens → multi-tokens, using bitmasks to track
+ * which entity types remain valid as parsing progresses. Each level eliminates invalid possibilities
+ * through intersection (AND) operations on bitmasks.
+ *
+ * <p>Overall Algorithm:
+ * <ol>
+ *   <li>Initialize all bitmasks to include all possible entity types</li>
+ *   <li>Process each character, updating current sub-token state and bitmasks</li>
+ *   <li>On delimiter encounters, finalize entities in order: sub-token → token → multi-token</li>
+ *   <li>When entities become invalid (bitmask = 0), flush buffered content as pattern text or arguments</li>
+ *   <li>Continue until end of message, then create final PatternedMessage</li>
+ * </ol>
+ *
+ * <p>Entity Finalization Process:
+ *
+ * <p>Sub-token Finalization (on any delimiter):
+ * <ul>
+ *   <li>Validate sub-token length and content against schema constraints</li>
+ *   <li>For integer sub-tokens: lookup bitmask based on numeric value</li>
+ *   <li>For string sub-tokens: evaluate against constraint functions</li>
+ *   <li>Update token bitmask based on sub-token validity and position</li>
+ *   <li>Buffer sub-token data for potential token/multi-token creation</li>
+ * </ul>
+ *
+ * <p>Token Finalization (on token/line delimiters):
+ * <ul>
+ *   <li>Validate token against sub-token count constraints</li>
+ *   <li>If valid: determine token type, buffer for multi-token evaluation</li>
+ *   <li>Update multi-token bitmask based on token type and position</li>
+ *   <li>Copy sub-token values to multi-token buffer for timestamp creation</li>
+ * </ul>
+ *
+ * <p>Multi-token Finalization (when bitmask becomes invalid):
+ * <ul>
+ *   <li>Attempt to create multi-token from buffered tokens (e.g., timestamps)</li>
+ *   <li>If successful: create typed argument, add to result</li>
+ *   <li>If failed: process each buffered token individually as arguments</li>
+ *   <li>Process remaining sub-tokens as either typed arguments or literal text</li>
+ * </ul>
+ *
+ * <p>Bitmask Strategy:
+ * <p>Each entity maintains a bitmask representing valid types. As parsing progresses:
+ * <ul>
+ *   <li>Character validation: {@code bitmask &= charToSubTokenBitmask[char]}</li>
+ *   <li>Position validation: {@code bitmask &= validTypesForPosition[position]}</li>
+ *   <li>Constraint validation: {@code bitmask &= constraintEvaluationResult}</li>
+ *   <li>When bitmask becomes 0, the entity is invalid and triggers buffer flushing</li>
+ * </ul>
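+ *
+ * <p>As a rough illustration only (the literal bit values below are made up and not taken from the
+ * compiled schema), the elimination step is simply a chain of bitwise AND operations:
+ * <pre>{@code
+ * int subTokenBitmask = allSubTokenBitmask;        // e.g. 0b1111 - every registered sub-token type
+ * subTokenBitmask &= charToSubTokenBitmask['2'];   // a digit keeps only numeric-capable types, e.g. 0b0110
+ * subTokenBitmask &= charToSubTokenBitmask['x'];   // a letter may narrow it further, e.g. to 0b0100 (hex)
+ * if (subTokenBitmask == 0) {
+ *     // no registered sub-token type matches - flush buffered content as plain pattern text
+ * }
+ * }</pre>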
+ * + * @param rawMessage the input message to parse + * @return an ordered list of typed arguments extracted from the message + */ + @SuppressWarnings("fallthrough") + public List> parse(String rawMessage) throws ParseException { + if (rawMessage == null || rawMessage.isEmpty()) { + return Collections.emptyList(); + } + reset(); + SubstringView substringView = new SubstringView(rawMessage); + List> templateArguments = new ArrayList<>(); + for (int indexWithinRawMessage = 0; indexWithinRawMessage <= rawMessage.length(); indexWithinRawMessage++) { + byte charType; + char currentChar; + if (indexWithinRawMessage == rawMessage.length()) { + currentChar = ' '; + charType = LINE_END_CODE; + } else { + currentChar = rawMessage.charAt(indexWithinRawMessage); + charType = charToCharType[currentChar]; + } + + if (currentSubTokenStartIndex < 0) { + currentSubTokenStartIndex = indexWithinRawMessage; + } + + // The following check may break when dealing with non-ASCII characters, specifically for code points in the range + // 0xD800 to 0xDFFF that are used for surrogate pairs (four bytes) in UTF-16. For now, we assume we only allow ASCII characters + // in the schema settings. + if (currentChar > numUsedCharacters) { + // in the future we may want to handle non-ASCII characters, but for now we treat current tokens are simple text + currentSubTokenBitmask = currentTokenBitmask = currentMultiTokenBitmask = 0; + } + + currentSubTokenBitmask &= charToSubTokenBitmask[currentChar]; + + switch (charType) { + case DIGIT_CHAR_CODE: + isCurSubTokenContainsDigits = true; + currentSubTokenIntValue = currentSubTokenIntValue * 10 + currentChar - '0'; + break; + case SUBTOKEN_DELIMITER_CHAR_CODE: + if (currentChar == '-') { + if (currentSubTokenStartIndex == indexWithinRawMessage) { + currentSubTokenSignPrefix = Sign.MINUS; + // don't treat as a delimiter but as a sign prefix - continue parsing next character + break; + } + } else if (currentChar == '+') { + if (currentSubTokenStartIndex == indexWithinRawMessage) { + currentSubTokenSignPrefix = Sign.PLUS; + // don't treat as a delimiter but as a sign prefix - continue parsing next character + break; + } + } else if (currentChar == '.') { + isPotentialDecimalNumber = currentTokenSubTokenIndex < 0 && (currentSubTokenBitmask & intSubTokenBitmask) != 0; + } + + // everything we need to do at the end of a sub-token we also must do at the end of a token, so we share the logic + // in the next case - fallthrough is intended + case TOKEN_DELIMITER_CHAR_CODE: + // everything we need to do at the end of a token we also must do at the end of a line, so we share the logic + // in the next case - fallthrough is intended + case LINE_END_CODE: + boolean flushBufferedInfo = false; + + // whether we are processing a subToken delimiter or a token delimiter, once we encounter a state that invalidates + // the current parsed entity, we abort and write all buffered info (tokens and subTokens) to the pattern and/or + // as arguments in the right order: multi-token, token, subTokens. + // breaking from this case without writing the buffered info should happen only when we are still within a token + // parsing, or within a multi-token parsing. + + // currentSubTokenEndIndex is exclusive, meaning it points to the first character after the current subToken + int currentSubTokenEndIndex = (currentSubTokenSuffixStartIndex >= 0) + ? 
currentSubTokenSuffixStartIndex + : indexWithinRawMessage; + int currentSubTokenLength = currentSubTokenEndIndex - currentSubTokenStartIndex; + + if (currentSubTokenLength == 0) { + // empty tokens (for example, just "-") should have a zero bitmask, so we must change from the default non-zero + currentSubTokenBitmask = 0; + } + + currentTokenSubTokenIndex++; + if (currentSubTokenBitmask == 0 || currentTokenSubTokenIndex == compiledSchema.maxSubTokensPerToken) { + // we either already passed the maximum number of subTokens for any known token, or the current subToken is + // invalid - both indication that the current token is invalid + currentTokenBitmask = 0; + } + + if (currentSubTokenSignPrefix == Sign.MINUS) { + currentSubTokenIntValue = -currentSubTokenIntValue; + } + + CharSpecificParsingInfo delimiterParsingInfo = null; + if (currentTokenBitmask == 0) { + // no need to evaluate specific subToken types, the generic type would be enough to create generic arguments + flushBufferedInfo = true; + } else { + delimiterParsingInfo = charSpecificParsingInfos[currentChar]; + + // update the current token bitmask to include only valid tokens with the current delimiter character in the + // current subToken position + currentTokenBitmask &= delimiterParsingInfo.tokenBitmaskPerDelimiterPosition[currentTokenSubTokenIndex]; + + // here we enforce subToken specific constraints (numeric or string) to update the current subToken bitmask + if ((currentSubTokenBitmask & intSubTokenBitmask) != 0) { + // integer subToken + if (currentSubTokenIntValue >= 0 && currentSubTokenIntValue < smallIntegerSubTokenUpperBound) { + // faster bitmask lookup for small integers + currentSubTokenBitmask = compiledSchema.smallIntegerSubTokenBitmasks[currentSubTokenIntValue]; + } else { + currentSubTokenBitmask = findBitmaskForInteger( + currentSubTokenIntValue, + compiledSchema.integerSubTokenBitmaskArrayRanges, + compiledSchema.integerSubTokenBitmasks + ); + } + } else { + // general string subToken + ToIntFunction subTokenBitmaskGenerator = null; + if (delimiterParsingInfo.bitmaskGeneratorPerPosition != null) { + subTokenBitmaskGenerator = delimiterParsingInfo.bitmaskGeneratorPerPosition[currentTokenSubTokenIndex]; + } + if (subTokenBitmaskGenerator != null) { + substringView.set(currentSubTokenStartIndex, currentSubTokenEndIndex); + int substringBitmask = subTokenBitmaskGenerator.applyAsInt(substringView); + if (substringBitmask == 0) { + // not a specific subToken, so we keep only the generic subToken types + currentSubTokenBitmask &= genericSubTokenTypesBitmask; + } else { + // the subToken is valid, so we set the bitmask to the evaluated value + currentSubTokenBitmask &= substringBitmask; + currentSubTokenIntValue = subTokenNumericValueRepresentationMap.applyAsInt(substringView); + } + } else { + // no bitmask generator for this subToken, meaning no known token expects this delimiter character + // at this position + currentSubTokenBitmask = 0; + } + } + + // update the current token bitmask based on all "on" bits in the current sub-token bitmask + currentTokenBitmask &= subTokenBitmaskRegistry.getHigherLevelBitmaskByPosition( + currentSubTokenBitmask, + currentTokenSubTokenIndex + ); + } + + // buffer the current subToken info + if (currentSubTokenBitmask != 0) { + bufferedSubTokensIndex++; + bufferedSubTokenBitmasks[bufferedSubTokensIndex] = currentSubTokenBitmask; + bufferedSubTokenStartIndexes[bufferedSubTokensIndex] = currentSubTokenStartIndex; + bufferedSubTokenIntValues[bufferedSubTokensIndex] = 
currentSubTokenIntValue; + bufferedSubTokenLengths[bufferedSubTokensIndex] = currentSubTokenLength; + bufferedSubTokenSigns[bufferedSubTokensIndex] = currentSubTokenSignPrefix; + if (bufferedSubTokensIndex == maxSubTokensPerMultiToken - 1) { + // we are at the maximum number of subTokens for any known multi-token so we must flush the buffered info + flushBufferedInfo = true; + } + } + + if (currentTokenStartIndex < 0) { + // ending the first sub-token of the current token + currentTokenStartIndex = currentSubTokenStartIndex; + currentTokenSubTokenStartIndex = bufferedSubTokensIndex; + } + + boolean finalizeMultiToken = false; + if (charType == TOKEN_DELIMITER_CHAR_CODE || charType == LINE_END_CODE) { + int currentTokenLength = currentSubTokenEndIndex - currentTokenStartIndex; + if (currentTokenLength == 0) { + // empty tokens (consecutive white spaces) should have a zero bitmask, so we must change from the default + // non-zero + currentTokenBitmask = 0; + } + + // eliminate token types with the wrong number of subTokens/tokens + currentTokenBitmask &= compiledSchema.subTokenCountToTokenBitmask[currentTokenSubTokenIndex]; + + if (currentTokenBitmask != 0) { + TokenType currentToken = tokenBitmaskRegistry.getHighestPriorityType(currentTokenBitmask); + bufferedTokensIndex++; + bufferedTokens[bufferedTokensIndex] = currentToken; + bufferedTokenStartIndexes[bufferedTokensIndex] = currentTokenStartIndex; + bufferedTokenLengths[bufferedTokensIndex] = currentTokenLength; + bufferedTokenSubTokenFirstIndexes[bufferedTokensIndex] = currentTokenSubTokenStartIndex; + bufferedTokenSubTokenLastIndexes[bufferedTokensIndex] = bufferedSubTokensIndex; + + isBufferedTokenDecimalNumber[bufferedTokensIndex] = isPotentialDecimalNumber + && currentToken.encodingType() == EncodingType.DOUBLE + && currentTokenSubTokenIndex == 1 + && (currentSubTokenBitmask & intSubTokenBitmask) != 0; + + if (bufferedTokensIndex == 0) { + currentMultiTokenStartIndex = currentTokenStartIndex; + } else if (bufferedTokensIndex == maxTokensPerMultiToken - 1) { + // we reached the maximum number of tokens for any known multi-token, so we must flush the buffered info + flushBufferedInfo = true; + } + + // update the current multi-token bitmask based on the current token + currentMultiTokenBitmask &= tokenBitmaskRegistry.getHigherLevelBitmaskByPosition( + currentTokenBitmask, + bufferedTokensIndex + ); + + if (currentMultiTokenBitmask != 0) { + // finalizing the current delimiter part, which would provide the indication whether it is + // time to finalize the current multi-token + int tmpDelimiterPartPosition = currentDelimiterPartPosition; + int tmpMultiTokenBitmask = currentMultiTokenBitmask; + if (currentSubTokenSuffixStartIndex >= 0) { + for (int i = currentSubTokenSuffixStartIndex; i < indexWithinRawMessage; i++) { + tmpDelimiterPartPosition++; + CharSpecificParsingInfo suffixCharParsingInfo = charSpecificParsingInfos[rawMessage.charAt(i)]; + if (suffixCharParsingInfo != null) { + int[] mtb2dpp = suffixCharParsingInfo.multiTokenBitmaskPerDelimiterPartPosition; + if (mtb2dpp != null && tmpDelimiterPartPosition < mtb2dpp.length) { + tmpMultiTokenBitmask &= mtb2dpp[tmpDelimiterPartPosition]; + } else { + tmpMultiTokenBitmask = 0; + } + } else { + tmpMultiTokenBitmask = 0; + } + } + } + tmpDelimiterPartPosition++; + if (delimiterParsingInfo.multiTokenBitmaskPerDelimiterPartPosition != null + && tmpDelimiterPartPosition < delimiterParsingInfo.multiTokenBitmaskPerDelimiterPartPosition.length) { + tmpMultiTokenBitmask &= + 
delimiterParsingInfo.multiTokenBitmaskPerDelimiterPartPosition[tmpDelimiterPartPosition]; + } else { + tmpMultiTokenBitmask = 0; + } + + if (tmpMultiTokenBitmask != 0) { + // we are still within a valid multi-token parsing, so we can proceed to parse the next token + currentMultiTokenBitmask = tmpMultiTokenBitmask; + currentDelimiterPartPosition = tmpDelimiterPartPosition; + + if (charType == LINE_END_CODE) { + // end of line reached - time to finalize the multi-token + finalizeMultiToken = true; + } + } else { + if (bufferedTokensIndex > 0) { + // having more that one token buffered and the switch from non-zero multi-token bitmask to zero + // indicates that we should try to finalize the multi-token now + finalizeMultiToken = true; + } + flushBufferedInfo = true; + } + } else { + flushBufferedInfo = true; + } + } else { + currentMultiTokenBitmask = 0; + flushBufferedInfo = true; + } + resetTokenState(); + } + + if (flushBufferedInfo || charType == LINE_END_CODE) { + int flushedTokens = 0; + int flushedSubTokens = 0; + if (finalizeMultiToken) { + // eliminate multi-token types with the wrong number of tokens or subTokens + currentMultiTokenBitmask &= compiledSchema.tokenCountToMultiTokenBitmask[bufferedTokensIndex]; + currentMultiTokenBitmask &= compiledSchema.subTokenCountToMultiTokenBitmask[bufferedSubTokensIndex]; + + // eliminate multi-token types with the wrong delimiter parts total length + if (currentDelimiterPartPosition < delimiterPartsLengthToMultiTokenBitmask.length) { + currentMultiTokenBitmask &= delimiterPartsLengthToMultiTokenBitmask[currentDelimiterPartPosition]; + } else { + currentMultiTokenBitmask = 0; + } + + // todo - instead of all these thrown exceptions, we should log errors and continue parsing the best we can + if (currentMultiTokenBitmask != 0) { + MultiTokenType multiTokenType = multiTokenBitmaskRegistry.getUniqueType(currentMultiTokenBitmask); + if (multiTokenType != null) { + Argument argument = null; + if (multiTokenType.getNumSubTokens() != bufferedSubTokensIndex + 1) { + throw new IllegalStateException( + String.format( + Locale.ROOT, + "Multi-token type %s expects %d subTokens, but there are only %d buffered subTokens", + multiTokenType.name(), + multiTokenType.getNumSubTokens(), + bufferedSubTokensIndex + 1 + ) + ); + } + if (multiTokenType.encodingType() == EncodingType.TIMESTAMP) { + int currentMultiTokenEndIndex = bufferedTokenStartIndexes[bufferedTokensIndex] + + bufferedTokenLengths[bufferedTokensIndex]; + int multiTokenLength = currentMultiTokenEndIndex - currentMultiTokenStartIndex; + long timestampMillis = multiTokenType.getTimestampFormat().toTimestamp(bufferedSubTokenIntValues); + argument = new Timestamp( + currentMultiTokenStartIndex, + multiTokenLength, + timestampMillis, + multiTokenType.getTimestampFormat().getJavaTimeFormat() + ); + } else { + throw new ParseException("Unknown multi-token type: " + multiTokenType.name()); + } + if (argument != null) { + templateArguments.addLast(argument); + // multi-token generation consumes all buffered tokens and sub-tokens + flushedTokens = bufferedTokensIndex + 1; + flushedSubTokens = bufferedSubTokensIndex + 1; + } + } else { + throw new ParseException("Ambiguous multi-token type, schema should be changed"); + } + } else { + // todo - invalid multi-token, we need to recalculate the buffered tokens and check if any sequence + // of them can form a valid multi-token. 
We should still + } + resetMultiTokenState(); + } + + // write each buffered token as a pattern argument + for (int i = flushedTokens; i <= bufferedTokensIndex; i++) { + TokenType tokenType = bufferedTokens[i]; + Argument argument = switch (tokenType.encodingType) { + case TIMESTAMP -> { + if (i > 0) { + // if we are here, it means that a valid multi-token timestamp was not formed and timestamp tokens + // cannot be non-first in a multi-token + throw new ParseException( + "Timestamp token cannot be the non-first in a multi-token, but found at position " + i + ); + } + long timestampMillis = tokenType.getTimestampFormat().toTimestamp(bufferedSubTokenIntValues); + yield new Timestamp( + bufferedTokenStartIndexes[i], + bufferedTokenLengths[i], + timestampMillis, + tokenType.getTimestampFormat().getJavaTimeFormat() + ); + } + case INTEGER -> { + int integerSubTokenIndex = bufferedTokenSubTokenFirstIndexes[i]; + yield new IntegerArgument( + bufferedTokenStartIndexes[i], + bufferedTokenLengths[i], + bufferedSubTokenIntValues[integerSubTokenIndex], + bufferedSubTokenSigns[integerSubTokenIndex] + ); + } + case DOUBLE -> { + if (isBufferedTokenDecimalNumber[i]) { + // an optimization for simple decimal numbers - if we are here, it means that + // this token contains a single decimal point and two integer sub-tokens + int firstSubTokenIndex = bufferedTokenSubTokenFirstIndexes[i]; + int fractionalSubTokenLength = bufferedSubTokenLengths[firstSubTokenIndex + 1]; + int fractionalSubTokenIntValue = bufferedSubTokenIntValues[firstSubTokenIndex + 1]; + if (bufferedSubTokenSigns[firstSubTokenIndex] == Sign.MINUS) { + fractionalSubTokenIntValue = -fractionalSubTokenIntValue; + } + double doubleValue = bufferedSubTokenIntValues[firstSubTokenIndex] + fractionalSubTokenIntValue + / Math.pow(10, fractionalSubTokenLength); + yield new DoubleArgument(bufferedTokenStartIndexes[i], bufferedTokenLengths[i], doubleValue); + } else { + // parse as a general double argument + yield new DoubleArgument(rawMessage, bufferedTokenStartIndexes[i], bufferedTokenLengths[i]); + } + } + case HEX -> new HexadecimalArgument(rawMessage, bufferedTokenStartIndexes[i], bufferedTokenLengths[i]); + case IPV4 -> new IPv4Argument( + bufferedTokenStartIndexes[i], + bufferedTokenLengths[i], + bufferedSubTokenIntValues, + bufferedTokenSubTokenFirstIndexes[i] + ); + case UUID -> new UUIDArgument(rawMessage, bufferedTokenStartIndexes[i], bufferedTokenLengths[i]); + // todo - add support for local time arguments, relying on java.time.LocalTime + default -> null; + }; + if (argument != null) { + templateArguments.addLast(argument); + flushedTokens++; + flushedSubTokens = bufferedTokenSubTokenLastIndexes[i] + 1; + } + } + + // for each buffered subToken: if the subToken bitmask is !=0, add placeholder to the pattern and add the + // subToken as an argument, + // else write the subToken as is to the pattern, then add the corresponding delimiter character to the pattern + for (int i = flushedSubTokens; i <= bufferedSubTokensIndex; i++) { + Argument argument = null; + // this is not about specific subToken types, here we are only dealing with generic subToken types, so we + // must switch off all specific subToken types, otherwise they have precedence + int subTokenBitmask = bufferedSubTokenBitmasks[i] & genericSubTokenTypesBitmask; + if (subTokenBitmask != 0) { + SubTokenType subTokenType = subTokenBitmaskRegistry.getHighestPriorityType(subTokenBitmask); + argument = switch (subTokenType.encodingType) { + case INTEGER -> new IntegerArgument( + 
bufferedSubTokenStartIndexes[i], + bufferedSubTokenLengths[i], + bufferedSubTokenIntValues[i], + bufferedSubTokenSigns[i] + ); + case DOUBLE -> new DoubleArgument( + rawMessage, + bufferedSubTokenStartIndexes[i], + bufferedSubTokenLengths[i] + ); + case HEX -> new HexadecimalArgument( + rawMessage, + bufferedSubTokenStartIndexes[i], + bufferedSubTokenLengths[i] + ); + default -> null; + }; + } else if (isCurSubTokenContainsDigits) { + argument = new KeywordArgument(rawMessage, bufferedSubTokenStartIndexes[i], bufferedSubTokenLengths[i]); + } + + if (argument != null) { + templateArguments.addLast(argument); + } + } + resetTokenState(); + resetBuffers(); + } + resetSubTokenState(); + break; + case TOKEN_BOUNDARY_CHAR_CODE: + if (currentSubTokenStartIndex == indexWithinRawMessage) { + // this is a sub-token prefix + currentSubTokenPrefixEndIndex = indexWithinRawMessage; + currentSubTokenStartIndex = -1; + } else if (currentSubTokenSuffixStartIndex < 0) { + // this is the first sub-token boundary suffix character + currentSubTokenSuffixStartIndex = indexWithinRawMessage; + } + break; + default: + } + } + return templateArguments; + } + + /** + * Finds the bitmask for the current integer value in the compiled schema through binary search. + * See {@link CompiledSchema#integerSubTokenBitmaskArrayRanges} for details. + * This method is separated so that it would be easily testable. It is very likely to be inlined by the JIT compiler, but consider + * adding it directly to the parsing loop. + * This algorithm assumest that the last entry in the integerSubTokenBitmaskArrayRanges is always {@link Integer#MAX_VALUE}. + * @param value the integer value to find the bitmask for + * @return the bitmask for the given integer value. + */ + static int findBitmaskForInteger(final int value, final int[] integerSubTokenBitmaskArrayRanges, final int[] integerSubTokenBitmasks) { + int low = 0; + int high = integerSubTokenBitmaskArrayRanges.length - 1; + while (low < high) { + int mid = (low + high) / 2; + if (integerSubTokenBitmaskArrayRanges[mid] < value) { + low = mid + 1; + } else { + high = mid; + } + } + return integerSubTokenBitmasks[low]; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/CharSpecificParsingInfo.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/CharSpecificParsingInfo.java new file mode 100644 index 0000000000000..0c0f1fcebe162 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/CharSpecificParsingInfo.java @@ -0,0 +1,58 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import java.util.function.ToIntFunction; + +/** + * An immutable class that holds information about a specific character-related parsing details (e.g., a subToken delimiter character). + */ +public final class CharSpecificParsingInfo { + + /** + * The that this info instance is responsible for. + */ + public final char character; + + /** + * This array indicates what token bitmask is valid for the delimiter character represented by the current instance at each position + * between two subTokens of the parsed token. 
For example, the bitmask at index 2 indicates what tokens + * are valid if the current delimiter character is found between the third and fourth subTokens of the currently parsed token. + */ + public final int[] tokenBitmaskPerDelimiterPosition; + + /** + * Provides a way to generate a sub-token bitmask based on a substring for each location within a token. + * This array contains instances of {@link ToIntFunction} for each subToken index, specific to the delimiter represented by the + * current instance. The index in the array corresponds to the index of the sub-token within the parsed token. + * For example, the generator function at index 2 is used to generate the bitmask of all valid sub-tokens that are the third sub-token + * in the parsed token, based on the substring found between the second and third occurrence of the delimiter character. + */ + public final ToIntFunction[] bitmaskGeneratorPerPosition; + + /** + * This array indicates what multi-token bitmask is valid when the character represented by the current instance is found at each + * position within the total length of multi-token delimiter parts. + * For example, given the following multi-token format: "$Mon, $DD $YYYY $timeS $AP", the full concatenated string made up of the + * delimiter parts is ", ". Therefore, the bit of this multi-token will be set at index 0 for the character ',' and at indices 1, + * 2, 3, and 4 for the space character. This enables exact match of multi-token formats. + */ + public final int[] multiTokenBitmaskPerDelimiterPartPosition; + + public CharSpecificParsingInfo( + char character, + int[] tokenBitmaskPerDelimiterPosition, + ToIntFunction[] bitmaskGeneratorPerPosition, + int[] multiTokenBitmaskPerDelimiterPartPosition + ) { + this.character = character; + this.tokenBitmaskPerDelimiterPosition = tokenBitmaskPerDelimiterPosition; + this.bitmaskGeneratorPerPosition = bitmaskGeneratorPerPosition; + this.multiTokenBitmaskPerDelimiterPartPosition = multiTokenBitmaskPerDelimiterPartPosition; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/MultiTokenType.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/MultiTokenType.java new file mode 100644 index 0000000000000..a44118cfba3b3 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/MultiTokenType.java @@ -0,0 +1,17 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +public final class MultiTokenType extends ParsingType { + + public MultiTokenType(String name, EncodingType encodingType, int numSubTokens, TimestampFormat timestampFormat) { + super(name, encodingType, numSubTokens, timestampFormat, null); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/ParsingType.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/ParsingType.java new file mode 100644 index 0000000000000..0066a1ad5bf15 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/ParsingType.java @@ -0,0 +1,91 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.Type; + +import java.util.Locale; + +public abstract class ParsingType implements Type { + + protected final String name; + protected final EncodingType encodingType; + + /** + * The number of sub-tokens that this parsing type is composed of. + * For example, a token type that represents a date would typically be composed of 3 sub-tokens: year, month, and day. + * A multi-token type that represents a full timestamp would typically be composed of at least 6 timestamp components: year, month, day, + * hour, minute, and second. + * Sub-token types will always have numSubTokens = 1. + */ + private final int numSubTokens; + + protected final TimestampFormat timestampFormat; + + /** + * Higher-level entity bitmask for this parsing type by position. + * For example, if this parsing type is a sub-token, the bitmask at position 2 will indicate + * the token types in which this sub-token can be found at the third position. + */ + protected final int[] higherLevelBitmaskByPosition; + + protected ParsingType( + String name, + EncodingType encodingType, + int numSubTokens, + TimestampFormat timestampFormat, + int[] higherLevelBitmaskByPosition + ) { + if (timestampFormat == null) { + if (encodingType == EncodingType.TIMESTAMP) { + throw new IllegalStateException( + String.format(Locale.ROOT, "Multi-token type %s is a timestamp, but does not have a timestamp format defined", name) + ); + } + } + this.name = name; + this.encodingType = encodingType; + this.numSubTokens = numSubTokens; + this.timestampFormat = timestampFormat; + this.higherLevelBitmaskByPosition = higherLevelBitmaskByPosition; + } + + @Override + public String name() { + return name; + } + + @Override + public EncodingType encodingType() { + return encodingType; + } + + public int getNumSubTokens() { + return numSubTokens; + } + + public TimestampFormat getTimestampFormat() { + return timestampFormat; + } + + /** + * Returns the higher-level entity bitmask for the specified position of this parsing type within its higher-level entity. 
+ * For example, if this parsing type is a sub-token, the bitmask returned when invoking this method with position 2 will indicate + * the token types in which this sub-token can be found at the third position. + * + * @param position the position in the token + * @return the bitmask for the specified position + */ + public int getHigherLevelBitmaskByPosition(int position) { + if (higherLevelBitmaskByPosition == null) { + throw new UnsupportedOperationException("This parsing type does not support higher-level bitmask by position"); + } + return higherLevelBitmaskByPosition[position]; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/SubTokenType.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/SubTokenType.java new file mode 100644 index 0000000000000..12bdb40b0e371 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/SubTokenType.java @@ -0,0 +1,29 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.TimestampComponentType; + +public final class SubTokenType extends ParsingType { + final TimestampComponentType timestampComponentType; + + public SubTokenType( + String name, + EncodingType encodingType, + int[] tokenBitmaskByPosition, + TimestampComponentType timestampComponentType + ) { + super(name, encodingType, 1, null, tokenBitmaskByPosition); + this.timestampComponentType = timestampComponentType; + } + + public TimestampComponentType getTimestampComponentType() { + return timestampComponentType; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/SubstringToIntegerMap.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/SubstringToIntegerMap.java new file mode 100644 index 0000000000000..c14c4a964701a --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/SubstringToIntegerMap.java @@ -0,0 +1,132 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.function.ToIntFunction; +import java.util.stream.Collectors; + +/** + * An immutable mapping from SubstringView to integer bitmask values. + * todo: consider optimizations, mostly regarding equality checks of SubstringView keys - either through the equals implementation (e.g. + * failing fast on hashcode mismatch) and/or through making the map sparse (e.g. using a trie or a perfect hash function if the set of + * keys is known to be static and unchanging). 
+ */ +public final class SubstringToIntegerMap implements ToIntFunction { + + private final Map map; + + private SubstringToIntegerMap(Map map) { + this.map = map; + } + + @Override + public int applyAsInt(final SubstringView input) { + Integer value = map.get(input); + // unboxing - no allocation (as opposed to boxing) and most likely ~0 overhead for primitive value retrieval + return value != null ? value : 0; + } + + public boolean isEmpty() { + return map.isEmpty(); + } + + /** + * Builder for creating SubstringToIntMap instances. + */ + public static class Builder { + private final Map map; + + private Builder() { + this.map = new HashMap<>(); + } + + /** + * Get the current value mapped to the given key. + * @param key the string key + * @return the mapped integer value, or null if the key does not exist + */ + public Integer get(String key) { + return map.get(key); + } + + /** + * Check if the map being built is empty. + * @return true if the map is empty, false otherwise + */ + public boolean isEmpty() { + return map.isEmpty(); + } + + /** + * Add an entry to the map being built if such does not already exist. If the key already exists, the mapped bitmask is updated + * by ORing the existing value with the new value. + * @param key the string key + * @param value the integer value + * @return this builder for method chaining + */ + public Builder add(String key, int value) { + map.merge(key, value, (oldVal, newVal) -> oldVal | newVal); + return this; + } + + public Builder addAll(SubstringToIntegerMap other) { + other.map.forEach((key, value) -> this.add(key.toString(), value)); + return this; + } + + /** + * Add all entries from another map to the map being built. See {@link #add(String, int)} for handling of key collisions. + * @param otherMap the map whose entries are to be merged into this builder's map + * @return this builder for method chaining + */ + public Builder addAll(Map otherMap) { + otherMap.forEach(this::add); + return this; + } + + /** + * Add all entries of the given set to the map being built, mapping each string to the given bitmask value. + * If a key already exists, the mapped bitmask is updated by ORing the existing with the new one. + * @param otherKeys the set of strings to be added + * @param bitmask the bitmask value to map each string to + * @return this builder for method chaining + */ + public Builder addAll(Set otherKeys, int bitmask) { + otherKeys.forEach(key -> this.add(key, bitmask)); + return this; + } + + /** + * Build an immutable SubstringToIntMap from the accumulated entries. + * @return a new SubstringToIntMap instance + * @throws IllegalArgumentException if the map is empty + */ + public SubstringToIntegerMap build() { + if (map.isEmpty()) { + throw new IllegalArgumentException("Map cannot be null or empty"); + } + + Map substringViewMap = map.entrySet() + .stream() + .collect(Collectors.toMap(entry -> new SubstringView(entry.getKey()), Map.Entry::getValue)); + + return new SubstringToIntegerMap(Map.copyOf(substringViewMap)); + } + } + + /** + * Creates a new Builder instance. 
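+ * <p>Typical usage (the keys and bitmask values here are illustrative only):
+ * <pre>{@code
+ * SubstringToIntegerMap months = SubstringToIntegerMap.builder()
+ *     .add("Jan", 0b01)
+ *     .add("Feb", 0b01)
+ *     .build();
+ * int mask = months.applyAsInt(new SubstringView("Jan")); // 0b01; unknown keys map to 0
+ * }</pre>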
+ * @return a new Builder + */ + public static Builder builder() { + return new Builder(); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/SubstringView.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/SubstringView.java new file mode 100644 index 0000000000000..dba06bcb09082 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/SubstringView.java @@ -0,0 +1,101 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +/** + * Provides a substring view based on a given CharSequence, allowing for substring functionality without allocating new string objects. + * The substring's first index is inclusive and the last index is exclusive. + */ +public class SubstringView implements CharSequence { + private CharSequence source; + private int start; + private int end; + private int hashCode; + private boolean hashComputed; + + public SubstringView(CharSequence source) { + this(source, 0, source.length()); + } + + public SubstringView(CharSequence source, int start, int end) { + if (source == null) { + throw new IllegalArgumentException("Source cannot be null"); + } + if (start < 0 || end > source.length() || start > end) { + throw new IndexOutOfBoundsException("Invalid start or end index"); + } + this.source = source; + this.start = start; + this.end = end; + } + + public void set(CharSequence source, int start, int end) { + this.source = source; + set(start, end); + } + + public void set(int start, int end) { + this.start = start; + this.end = end; + this.hashCode = 0; + this.hashComputed = false; + } + + @Override + public int length() { + return end - start; + } + + @Override + public char charAt(int index) { + if (index < 0 || index >= length()) { + throw new IndexOutOfBoundsException("Index out of bounds: " + index); + } + return source.charAt(start + index); + } + + @Override + public CharSequence subSequence(int start, int end) { + throw new UnsupportedOperationException("Not supported to avoid allocations. 
This IS the subsequence."); + } + + @Override + public String toString() { + var sb = new StringBuilder(length()); + for (int i = start; i < end; i++) { + sb.append(source.charAt(i)); + } + return sb.toString(); + } + + @Override + public int hashCode() { + if (hashComputed == false) { + int h = 1; + for (int i = start; i < end; i++) { + h = 31 * h + source.charAt(i); + } + hashCode = h; + hashComputed = true; + } + return hashCode; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj instanceof SubstringView other) { + if (length() != other.length()) return false; + for (int i = 0; i < length(); i++) { + if (charAt(i) != other.charAt(i)) return false; + } + return true; + } + return false; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/TimestampFormat.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/TimestampFormat.java new file mode 100644 index 0000000000000..87c70d268acab --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/TimestampFormat.java @@ -0,0 +1,190 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.TimestampComponentType; + +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Locale; + +public final class TimestampFormat { + + private final String javaTimeFormat; + private final DateTimeFormatter dateTimeFormatter; + + /** + * Indicates the order of timestamp components in the format for the eventual calculation of the timestamp. + * This array's length equals to the number of {@link TimestampComponentType} instances. The value in index i in this array represents + * the index of the timestamp component in the input array that corresponds to the i-th component in this timestamp format. + * Whenever the timestamp format does not include a specific component, the corresponding index in this array will be -1. + * For example, if the format is "yyyy-MM-dd HH:mm:ss TZh:TZm", then the order array might look like this: + * [0, 1, 2, 3, -1, 4, 5, -1, -1, -1, 6, 7, -1] and the input array for the timestamp 2025-12-31 15:30:45 +02:00 will be: + * [2025, 12, 31, 15, 0, 30, 45, 0, 0, 0, 2, 0, 0] (see {@link TimestampComponentType} for the meaning of each index). 
+ */ + private final int[] timestampComponentsOrder; + + private final int numTimestampComponents; + + private final int yearIndex; + private final int monthIndex; + private final int dayIndex; + private final int hourIndex; + private final int amPmIndex; + private final int minuteIndex; + private final int secondIndex; + private final int millisecondIndex; + private final int microsecondIndex; + private final int nanosecondIndex; + private final int timezoneOffsetHoursIndex; + private final int timezoneOffsetMinutesIndex; + private final int timezoneOffsetHoursAndMinutesIndex; + + public TimestampFormat(String javaTimeFormat, int[] timestampComponentsOrder) { + this.javaTimeFormat = javaTimeFormat; + boolean amPm = false; + + this.timestampComponentsOrder = timestampComponentsOrder; + int timestampComponentsCount = 0; + this.yearIndex = timestampComponentsOrder[TimestampComponentType.YEAR_CODE]; + if (yearIndex < 0) { + throw new IllegalArgumentException("Timestamp format must include a year component"); + } + timestampComponentsCount++; + this.monthIndex = timestampComponentsOrder[TimestampComponentType.MONTH_CODE]; + if (monthIndex < 0) { + throw new IllegalArgumentException("Timestamp format must include a month component"); + } + timestampComponentsCount++; + this.dayIndex = timestampComponentsOrder[TimestampComponentType.DAY_CODE]; + if (dayIndex < 0) { + throw new IllegalArgumentException("Timestamp format must include a day component"); + } + timestampComponentsCount++; + this.hourIndex = timestampComponentsOrder[TimestampComponentType.HOUR_CODE]; + if (hourIndex < 0) { + throw new IllegalArgumentException("Timestamp format must include an hour component"); + } + timestampComponentsCount++; + this.amPmIndex = timestampComponentsOrder[TimestampComponentType.AM_PM_CODE]; + if (amPmIndex >= 0) { + timestampComponentsCount++; + amPm = true; + } + this.minuteIndex = timestampComponentsOrder[TimestampComponentType.MINUTE_CODE]; + if (minuteIndex < 0) { + throw new IllegalArgumentException("Timestamp format must include a minute component"); + } + timestampComponentsCount++; + this.secondIndex = timestampComponentsOrder[TimestampComponentType.SECOND_CODE]; + if (secondIndex < 0) { + throw new IllegalArgumentException("Timestamp format must include a second component"); + } + timestampComponentsCount++; + this.millisecondIndex = timestampComponentsOrder[TimestampComponentType.MILLISECOND_CODE]; + if (millisecondIndex >= 0) { + timestampComponentsCount++; + } + this.microsecondIndex = timestampComponentsOrder[TimestampComponentType.MICROSECOND_CODE]; + if (microsecondIndex >= 0) { + timestampComponentsCount++; + } + this.nanosecondIndex = timestampComponentsOrder[TimestampComponentType.NANOSECOND_CODE]; + if (nanosecondIndex >= 0) { + timestampComponentsCount++; + } + this.timezoneOffsetHoursIndex = timestampComponentsOrder[TimestampComponentType.TIMEZONE_OFFSET_HOURS_CODE]; + if (timezoneOffsetHoursIndex >= 0) { + timestampComponentsCount++; + } + this.timezoneOffsetMinutesIndex = timestampComponentsOrder[TimestampComponentType.TIMEZONE_OFFSET_MINUTES_CODE]; + if (timezoneOffsetMinutesIndex >= 0) { + timestampComponentsCount++; + } + this.timezoneOffsetHoursAndMinutesIndex = timestampComponentsOrder[TimestampComponentType.TIMEZONE_OFFSET_HOURS_AND_MINUTES_CODE]; + if (timezoneOffsetHoursAndMinutesIndex >= 0) { + timestampComponentsCount++; + } + this.numTimestampComponents = timestampComponentsCount; + + this.dateTimeFormatter = amPm + ? 
DateTimeFormatter.ofPattern(javaTimeFormat, Locale.US) + : DateTimeFormatter.ofPattern(javaTimeFormat, Locale.ROOT); + } + + public String getJavaTimeFormat() { + return javaTimeFormat; + } + + public int getNumTimestampComponents() { + return numTimestampComponents; + } + + public int[] getTimestampComponentsOrder() { + return timestampComponentsOrder; + } + + public long toTimestamp(int[] parsedTimestampComponents) { + int year, month, day, hour, minute, second, nanos, timezoneOffset; + year = parsedTimestampComponents[yearIndex]; + month = parsedTimestampComponents[monthIndex]; + day = parsedTimestampComponents[dayIndex]; + hour = parsedTimestampComponents[hourIndex]; + // Handle AM/PM if present + if (amPmIndex >= 0) { + int amPmCode = parsedTimestampComponents[amPmIndex]; + if (amPmCode == TimestampComponentType.AM_CODE && hour == 12) { + hour = 0; // 12 AM is midnight + } else if (amPmCode == TimestampComponentType.PM_CODE && hour < 12) { + hour += 12; // Convert PM hour to 24-hour format + } + } + minute = parsedTimestampComponents[minuteIndex]; + second = parsedTimestampComponents[secondIndex]; + if (millisecondIndex >= 0) { + nanos = parsedTimestampComponents[millisecondIndex] * 1_000_000; + } else if (microsecondIndex >= 0) { + nanos = parsedTimestampComponents[microsecondIndex] * 1000; + } else if (nanosecondIndex >= 0) { + nanos = parsedTimestampComponents[nanosecondIndex]; + } else { + nanos = 0; + } + LocalDateTime localDateTime = LocalDateTime.of(year, month, day, hour, minute, second, nanos); + // todo - properly compute timezone offset + // timezoneOffset = parsedTimestampComponents[timestampComponentsOrder[TimestampComponentType.TIMEZONE_OFFSET_HOURS_CODE]]; + timezoneOffset = 0; + // todo - probably better to use java.time.OffsetDateTime.of(int, int, int, int, int, int, int, java.time.ZoneOffset) + // no need to cache the ZoneOffset, as it is already cached in ZoneOffset class + // todo - is it possible to compute the timestamp without any allocation? + return localDateTime.toInstant(java.time.ZoneOffset.ofTotalSeconds(timezoneOffset)).toEpochMilli(); + } + + public long parseTimestamp(String timestampString) { + return parseTimestamp(dateTimeFormatter, timestampString); + } + + public static long parseTimestamp(DateTimeFormatter dateTimeFormatter, String timestampString) { + return dateTimeFormatter.parse(timestampString, LocalDateTime::from) + // todo - handle timezone offset + .toInstant(java.time.ZoneOffset.ofTotalSeconds(0)) + .toEpochMilli(); + } + + public String representAsString(long timestamp) { + return representAsString(dateTimeFormatter, timestamp); + } + + public static String representAsString(DateTimeFormatter dateTimeFormatter, long timestamp) { + return Instant.ofEpochMilli(timestamp) + // todo - handle timezone offset + .atZone(java.time.ZoneOffset.ofTotalSeconds(0)) + .format(dateTimeFormatter); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/TokenType.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/TokenType.java new file mode 100644 index 0000000000000..18068d6fa07a3 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/TokenType.java @@ -0,0 +1,23 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +public final class TokenType extends ParsingType { + + public TokenType( + String name, + EncodingType encodingType, + int numSubTokens, + TimestampFormat timestampFormat, + int[] multiTokenBitmaskByPosition + ) { + super(name, encodingType, numSubTokens, timestampFormat, multiTokenBitmaskByPosition); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/MultiTokenFormat.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/MultiTokenFormat.java new file mode 100644 index 0000000000000..0521119486535 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/MultiTokenFormat.java @@ -0,0 +1,54 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema; + +import java.util.List; + +/** + * A multi-token format can only contain valid token names prefixed with '$', separated by token delimiters and optionally + * {@link Schema#getTokenBoundaryCharacters() "token boundary characters"}. The {@link #delimiterParts} are literal strings that + * represent whole parts of the format between tokens, meaning that they may include only {@link Schema#getTokenDelimiters() token + * delimiters} and {@link Schema#getTokenBoundaryCharacters() token boundary characters}. This list is always one element shorter than the + * {@link #tokens} list. + */ +public class MultiTokenFormat { + private final String rawFormat; + private final List<String> delimiterParts; + private final List<TokenType> tokens; + + public MultiTokenFormat(String rawFormat, List<String> delimiterParts, List<TokenType> tokens) { + this.rawFormat = rawFormat; + this.delimiterParts = delimiterParts; + this.tokens = tokens; + } + + public String getRawFormat() { + return rawFormat; + } + + public List<String> getDelimiterParts() { + return delimiterParts; + } + + public List<TokenType> getTokens() { + return tokens; + } + + public int getNumberOfSubTokens() { + int count = 0; + for (TokenType token : tokens) { + count += token.getNumberOfSubTokens(); + } + return count; + } + + @Override + public String toString() { + return rawFormat; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/MultiTokenType.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/MultiTokenType.java new file mode 100644 index 0000000000000..0919a0b8d777c --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/MultiTokenType.java @@ -0,0 +1,45 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.Type; + +public class MultiTokenType implements Type { + private final String name; + private final EncodingType encodingType; + private final MultiTokenFormat format; + private final String description; + + public MultiTokenType(String name, EncodingType encodingType, MultiTokenFormat format, String description) { + this.name = name; + this.encodingType = encodingType; + this.format = format; + this.description = description; + } + + public String name() { + return name; + } + + public EncodingType encodingType() { + return encodingType; + } + + public MultiTokenFormat getFormat() { + return format; + } + + public int getNumberOfSubTokens() { + return format.getNumberOfSubTokens(); + } + + public String getDescription() { + return description; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/PatternUtils.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/PatternUtils.java new file mode 100644 index 0000000000000..27176727a3157 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/PatternUtils.java @@ -0,0 +1,426 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.OperatorType; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class PatternUtils { + /** + * Parse the characters pattern into a char array. + * Handles character ranges (e.g., a-z), character groups (e.g., abcd), + * and escaped characters (e.g., \-). 
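+ * For example (illustrative), {@code parseCharacters("[0-3x]")} returns an array containing the characters {@code 0}, {@code 1},
+ * {@code 2}, {@code 3} and {@code x}; the characters are collected into a set, so the order of the returned array is not guaranteed.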
+ * + * @param pattern the character pattern to parse (e.g., {@code [0-9]}, {@code [a-zA-Z]}, {@code [\-+0-9]}) + * @return a char array containing all characters represented by the pattern + * @throws IllegalArgumentException if the pattern format is invalid + */ + public static char[] parseCharacters(String pattern) { + // Validate input pattern + if (pattern == null || pattern.isEmpty()) { + throw new IllegalArgumentException("Pattern cannot be null or empty"); + } + + pattern = pattern.trim(); + if (pattern.length() < 2 || pattern.startsWith("[") == false || pattern.endsWith("]") == false) { + throw new IllegalArgumentException("Pattern must be enclosed in square brackets: " + pattern); + } + + // Remove brackets + String content = pattern.substring(1, pattern.length() - 1).trim(); + Set resultChars = new HashSet<>(); + + for (int i = 0; i < content.length(); i++) { + char currentChar = content.charAt(i); + + // Handle escape sequences + if (currentChar == '\\' && i + 1 < content.length()) { + // Add the escaped character directly + resultChars.add(content.charAt(i + 1)); + i++; // Skip the escaped character + } + // Handle character ranges (e.g., a-z) + else if (i + 2 < content.length() && content.charAt(i + 1) == '-') { + char endChar = content.charAt(i + 2); + + if (currentChar > endChar) { + throw new IllegalArgumentException("Invalid character range: " + currentChar + "-" + endChar); + } + + for (char c = currentChar; c <= endChar; c++) { + resultChars.add(c); + } + + i += 2; // Skip the processed range + } + // Add individual character + else { + resultChars.add(currentChar); + } + } + + // Convert set to array + char[] result = new char[resultChars.size()]; + int index = 0; + for (char c : resultChars) { + result[index++] = c; + } + + return result; + } + + public static String[] splitChainedAndConstraints(String constraint) { + return splitToTwoPartsIfNeeded(constraint, OperatorType.AND); + } + + public static String[] splitChainedOrConstraints(String constraint) { + return splitToTwoPartsIfNeeded(constraint, OperatorType.OR); + } + + private static String[] splitToTwoPartsIfNeeded(String constraint, OperatorType operator) { + if (constraint == null || constraint.trim().isEmpty()) { + throw new IllegalArgumentException("Constraint cannot be null or empty"); + } + + int orIdx = constraint.indexOf(operator.getSymbol()); + if (orIdx >= 0) { + String[] ret = new String[2]; + ret[0] = constraint.substring(0, orIdx).trim(); + ret[1] = constraint.substring(orIdx + 2).trim(); + return ret; + } else { + return new String[] { constraint }; + } + } + + /** + * Parse the simple constraint (a single, non-chained constraint) string into a ConstraintDetails object. 
+ * Handles: + * - Comparison operators: {@code >=9}, {@code <10}, {@code !=4}, {@code ==99} + * - Range expressions: {@code 0-255}, {@code (-5)-(-2)} + * - OR values: {@code 5|6|7|9} + * - Length constraints: {@code {9}} + * + * @param constraint the constraint string without logical operators + * @return a ConstraintDetails object containing the operator and operands + * @throws IllegalArgumentException if the constraint format is invalid + */ + public static ConstraintDetails parseSimpleConstraint(String constraint) { + if (constraint == null || constraint.trim().isEmpty()) { + throw new IllegalArgumentException("Constraint cannot be null or empty"); + } + + constraint = constraint.trim(); + + // Handle length constraint {n} + if (constraint.startsWith("{") && constraint.endsWith("}")) { + String lengthStr = constraint.substring(1, constraint.length() - 1).trim(); + return new ConstraintDetails(OperatorType.fromSymbol("{}"), new String[] { lengthStr }); + } + + // Handle set constraints (e.g., "5|6|7|9") or map constraints (e.g., "key1=value1|key2=value2") + if (constraint.contains(OperatorType.SET.getSymbol())) { + boolean isMap = constraint.contains("="); + String[] parts = constraint.split("\\|"); + String[] values; + if (isMap) { + values = new String[parts.length * 2]; + for (int i = 0; i < parts.length; i++) { + String[] keyValue = parts[i].split("="); + if (keyValue.length != 2) { + throw new IllegalArgumentException("Invalid map format: " + parts[i]); + } + values[i * 2] = removeParentheses(keyValue[0].trim()); + values[i * 2 + 1] = removeParentheses(keyValue[1].trim()); + } + return new ConstraintDetails(OperatorType.MAP, values); + } else { + for (int i = 0; i < parts.length; i++) { + parts[i] = removeParentheses(parts[i].trim()); + } + return new ConstraintDetails(OperatorType.SET, parts); + } + } + + // Handle comparison operators + if (constraint.startsWith(OperatorType.GREATER_THAN_OR_EQUAL.getSymbol()) + || constraint.startsWith(OperatorType.LESS_THAN_OR_EQUAL.getSymbol()) + || constraint.startsWith(OperatorType.EQUALITY.getSymbol()) + || constraint.startsWith(OperatorType.NOT_EQUAL.getSymbol())) { + String operator = constraint.substring(0, 2); + String operand = removeParentheses(constraint.substring(2).trim()); + return new ConstraintDetails(OperatorType.fromSymbol(operator), new String[] { operand }); + } + + if (constraint.startsWith(OperatorType.GREATER_THAN.getSymbol()) || constraint.startsWith(OperatorType.LESS_THAN.getSymbol())) { + String operator = constraint.substring(0, 1); + String operand = removeParentheses(constraint.substring(1).trim()); + return new ConstraintDetails(OperatorType.fromSymbol(operator), new String[] { operand }); + } + + // Handle range format (e.g., "0-255", "(-5)-(-2)") + int dashIdx = findRangeSeparatorIndex(constraint); + if (dashIdx > 0) { + String lowerBound = constraint.substring(0, dashIdx).trim(); + String upperBound = constraint.substring(dashIdx + 1).trim(); + lowerBound = removeParentheses(lowerBound); + upperBound = removeParentheses(upperBound); + return new ConstraintDetails(OperatorType.RANGE, new String[] { lowerBound, upperBound }); + } + + throw new IllegalArgumentException("Unrecognized constraint format: " + constraint); + } + + /** + * Removes parentheses from the start and end of a string. 
+ * Converts {@code (-N)} to {@code -N} + */ + private static String removeParentheses(String value) { + if (value != null && value.startsWith("(") && value.endsWith(")")) { + return value.substring(1, value.length() - 1).trim(); + } + return value; + } + + /** + * Finds the index of the range separator dash in a constraint string. + * Handles cases with negative numbers in parentheses. + * + * @param constraint the constraint string to search in + * @return the index of the range separator dash, or -1 if not found + */ + private static int findRangeSeparatorIndex(String constraint) { + int parenLevel = 0; + for (int i = 0; i < constraint.length(); i++) { + char c = constraint.charAt(i); + + if (c == '(') { + parenLevel++; + } else if (c == ')') { + parenLevel--; + } else if (c == '-' && parenLevel == 0) { + // Ensure this is not a negative number at the start + if (i > 0 && isOperatorChar(constraint.charAt(i - 1)) == false) { + return i; + } + } + } + return -1; + } + + /** + * Checks if a character is part of an operator + */ + private static boolean isOperatorChar(char c) { + return c == '=' || c == '<' || c == '>' || c == '!'; + } + + public record ConstraintDetails(OperatorType operator, String[] operands) {} + + /** + * Parses a token format string into a TokenFormat object. + * NOTE: This method may modify the provided subTokenTypes list by adding new ad-hoc subToken types if unknown ones are found + * during parsing. + * + * @param rawFormat The format string containing subTokens (e.g., {@code $octet.$octet.$octet.$octet} or {@code (%X{8})-$dd}) + * @param subTokenDelimiters Array of characters that can separate subTokens + * @param subTokenBaseTypes List of base subToken types, used to resolve ad-hoc subToken definitions + * @param subTokenTypes List of pre-defined subToken types + * @return A TokenFormat object representing the parsed format + * @throws IllegalArgumentException if the format is invalid + */ + public static TokenFormat parseTokenFormat( + String rawFormat, + char[] subTokenDelimiters, + List<SubTokenBaseType> subTokenBaseTypes, + List<SubTokenType> subTokenTypes + ) { + if (rawFormat == null || rawFormat.isEmpty()) { + throw new IllegalArgumentException("Format string cannot be null or empty"); + } + + // Split format based on delimiters while preserving them + List<String> parts = new ArrayList<>(); + List<Character> usedDelimiters = new ArrayList<>(); + int start = 0; + + int parenLevel = 0; + for (int i = 0; i < rawFormat.length(); i++) { + char c = rawFormat.charAt(i); + if (c == '(') { + parenLevel++; + } else if (c == ')') { + parenLevel--; + } + for (char delimiter : subTokenDelimiters) { + if (c == delimiter && parenLevel == 0) { + // Add the subToken before the delimiter + if (i > start) { + parts.add(removeParentheses(rawFormat.substring(start, i))); + } + // Keep track of the used delimiter + usedDelimiters.add(delimiter); + start = i + 1; + break; + } + } + } + + // Add the last part if there is one + if (start < rawFormat.length()) { + parts.add(removeParentheses(rawFormat.substring(start))); + } + + // Convert the list of used delimiters to an array + char[] usedDelimiterChars = new char[usedDelimiters.size()]; + for (int i = 0; i < usedDelimiters.size(); i++) { + usedDelimiterChars[i] = usedDelimiters.get(i); + } + + // Parse each subToken + SubTokenType[] subTokenArray = new SubTokenType[parts.size()]; + for (int i = 0; i < parts.size(); i++) { + String part = parts.get(i).trim(); + if (part.isEmpty()) { + throw new IllegalArgumentException("Empty subToken in format: " + rawFormat); + } + + if (part.startsWith("$")) { + // Reference to a pre-defined subToken type + String typeName = part.substring(1); + SubTokenType subTokenType = findSubTokenTypeByName(subTokenTypes, typeName); + if (subTokenType == null) { + throw new IllegalArgumentException("Unknown subToken type: " + typeName); + } + subTokenArray[i] = subTokenType; + } else if (part.startsWith("%")) { + // Ad-hoc subToken definition + if (part.length() < 2) { + throw new IllegalArgumentException("Invalid ad-hoc subToken: " + part); + } + + String subTokenName = SubTokenType.ADHOC_PREFIX + part; + SubTokenType subTokenType = findSubTokenTypeByName(subTokenTypes, subTokenName); + if (subTokenType == null) { + char baseTypeSymbol = part.charAt(1); + SubTokenBaseType baseType = findSubTokenBaseTypeBySymbol(subTokenBaseTypes, String.valueOf(baseTypeSymbol)); + if (baseType == null) { + throw new IllegalArgumentException("Unknown base type for ad-hoc subToken: " + baseTypeSymbol); + } + String constraint = part.substring(2).trim(); + subTokenType = new SubTokenType(subTokenName, baseType, constraint, "Ad-hoc subToken"); + // register the new ad-hoc subToken type + subTokenTypes.addLast(subTokenType); + } + subTokenArray[i] = subTokenType; + } else { + throw new IllegalArgumentException("Invalid subToken format: " + part); + } + } + + return new TokenFormat(rawFormat, usedDelimiterChars, subTokenArray); + } + + /** + * Parses a multi-token format string into a list of parts, which can be either a literal string or a {@link TokenType}. + * Handles references to token types (e.g., {@code $time}) and subToken types (e.g., {@code $Mon}). + * + * @param format The multi-token format string (e.g., {@code $Mon, $DD $YYYY} or {@code $datetime $TZA}) + * @param registeredTokenTypes List of defined token types + * @param boundaryChars A set of characters that define the boundaries of a token name + * @param formatTokens Output list to hold the extracted TokenType instances in order + * @param formatDelimiterParts Output list to hold the literal string parts between tokens + * @throws IllegalArgumentException if the format is invalid or contains unknown token references + */ + public static void parseMultiTokenFormat( + String format, + List<TokenType> registeredTokenTypes, + Set<Character> boundaryChars, + List<TokenType> formatTokens, + List<String> formatDelimiterParts + ) { + if (format == null || format.isEmpty()) { + throw new IllegalArgumentException("Format string cannot be null or empty"); + } + StringBuilder currentPart = new StringBuilder(); + boolean isTokenNamePart = false; + for (int i = 0; i < format.length(); i++) { + char c = format.charAt(i); + if (isTokenNamePart) { + if (c == '$') { + throw new IllegalArgumentException("Token names must be separated by delimiters: " + format); + } else { + if (boundaryChars.contains(c)) { + // end of token name + if (currentPart.length() > 0) { + addTokenToList(format, registeredTokenTypes, formatTokens, currentPart.toString()); + currentPart.setLength(0); // reset for next part + } else { + throw new IllegalArgumentException("Token name cannot be empty in format: " + format); + } + isTokenNamePart = false; + } + currentPart.append(c); + } + } else { + if (c == '$') { + // end of literal part + if (currentPart.length() > 0) { + formatDelimiterParts.add(currentPart.toString()); + currentPart.setLength(0); + } + isTokenNamePart = true; + } else if (boundaryChars.contains(c)) { + currentPart.append(c); + } else { + throw new IllegalArgumentException( + "Invalid format - only token delimiters and token boundary characters are allowed between tokens: " + format + ); + } + } + } + // Handle the last part + if (isTokenNamePart) { + addTokenToList(format, registeredTokenTypes, formatTokens, currentPart.toString()); + } else { + throw new IllegalArgumentException("Invalid format \"" + format + "\". Multi-token format must end with a valid token"); + } + } + + private static void addTokenToList(String format, List<TokenType> registeredTokens, List<TokenType> tokenList, String tokenName) { + TokenType token = findTokenTypeByName(registeredTokens, tokenName); + if (token == null) { + throw new IllegalArgumentException("Unknown token type: " + tokenName + " in format: " + format); + } + tokenList.add(token); + } + + private static SubTokenType findSubTokenTypeByName(List<SubTokenType> list, String name) { + for (SubTokenType t : list) { + if (t.name().equals(name)) return t; + } + return null; + } + + private static SubTokenBaseType findSubTokenBaseTypeBySymbol(List<SubTokenBaseType> list, String symbol) { + for (SubTokenBaseType t : list) { + if (t.symbol().equals(symbol)) return t; + } + return null; + } + + private static TokenType findTokenTypeByName(List<TokenType> list, String name) { + for (TokenType t : list) { + if (t.name().equals(name)) return t; + } + return null; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/Schema.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/Schema.java new file mode 100644 index 0000000000000..6361603c35539 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/Schema.java @@ -0,0 +1,406 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema; + +import org.elasticsearch.xcontent.XContentFactory; +import org.elasticsearch.xcontent.XContentParser; +import org.elasticsearch.xcontent.XContentParserConfiguration; +import org.elasticsearch.xcontent.XContentType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.TimestampComponentType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.Type; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +/** + * Loads and parses the schema.yaml file for token and sub-token definitions. 
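+ * Typical usage (illustrative sketch): obtain the shared instance via {@code Schema.getInstance()} and then look up definitions
+ * by name, e.g. {@code getTokenType(name)}, {@code getSubTokenType(name)} or {@code getMultiTokenType(name)}.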
+ */ +@SuppressWarnings({ "unchecked", "DataFlowIssue" }) +public class Schema { + private static final String SCHEMA_PATH = "/schema.yaml"; + + private static volatile Schema instance; + + private final char[] tokenDelimiters; + private final char[] subTokenDelimiters; + private final char[] tokenBoundaryCharacters; + private final ArrayList<SubTokenBaseType> subTokenBaseTypes; + private final ArrayList<SubTokenType> subTokenTypes; + private final ArrayList<TokenType> tokenTypes; + private final ArrayList<MultiTokenType> multiTokenTypes; + + public static Schema getInstance() { + if (instance == null) { + synchronized (Schema.class) { + if (instance == null) { + instance = new Schema(); + } + } + } + return instance; + } + + private Schema() { + try { + Map<String, Object> yamlMap = readYamlFileToMap(); + + // Parse special characters + Map<String, Object> specialCharsMap = (Map<String, Object>) yamlMap.get("special_characters"); + // we should not trim the token delimiters, as they contain whitespaces + tokenDelimiters = ((String) specialCharsMap.get("token_delimiters")).toCharArray(); + subTokenDelimiters = getConfigValue(specialCharsMap, "sub_token_delimiters").toCharArray(); + tokenBoundaryCharacters = getConfigValue(specialCharsMap, "token_boundary_characters").toCharArray(); + + Map<String, Object> schemaMap = (Map<String, Object>) yamlMap.get("schema"); + subTokenBaseTypes = parseSubTokenBaseTypes((List<Object>) schemaMap.get("sub_token_base_types")); + subTokenTypes = parseSubTokenTypes((List<Object>) schemaMap.get("sub_token_types")); + tokenTypes = parseTokenTypes((List<Object>) schemaMap.get("token_types")); + multiTokenTypes = parseMultiTokenTypes((List<Object>) schemaMap.get("multi_token_types")); + + } catch (Exception e) { + throw new RuntimeException("Error loading schema", e); + } + } + + private ArrayList<SubTokenBaseType> parseSubTokenBaseTypes(List<Object> subTokenBaseTypeConfigs) { + ArrayList<SubTokenBaseType> subTokenBaseTypes = new ArrayList<>(); + for (Object obj : subTokenBaseTypeConfigs) { + Map<String, Object> config = (Map<String, Object>) obj; + String name = getConfigValue(config, "name"); + String symbol = getConfigValue(config, "symbol"); + String encodingTypeStr = getConfigValue(config, "encoding_type"); + String javaTypeStr = getConfigValue(config, "java_type"); + String charactersPattern = getConfigValue(config, "characters"); + String description = getConfigValue(config, "description"); + + EncodingType encodingType = EncodingType.fromSymbol(encodingTypeStr.charAt(1)); + + // Convert java_type string to actual Class object + Class<?> javaType = switch (javaTypeStr) { + case "int" -> int.class; + case "double" -> double.class; + case "String" -> String.class; + default -> throw new IllegalArgumentException("Unsupported Java type: " + javaTypeStr); + }; + + char[] allowedCharacters = PatternUtils.parseCharacters(charactersPattern); + subTokenBaseTypes.add(new SubTokenBaseType(name, encodingType, symbol, javaType, description, allowedCharacters)); + } + return subTokenBaseTypes; + } + + private ArrayList<SubTokenType> parseSubTokenTypes(List<Object> subTokenTypeConfigs) { + ArrayList<SubTokenType> result = new ArrayList<>(); + for (Object obj : subTokenTypeConfigs) { + Map<String, Object> config = (Map<String, Object>) obj; + String name = getConfigValue(config, "name"); + String baseTypeStr = getConfigValue(config, "base_type"); + // Remove the '%' prefix + SubTokenBaseType baseType = fromSymbol(baseTypeStr.substring(1)); + + String constraint = getConfigValue(config, "constraint"); + String timestampComponentTypeStr = getConfigValue(config, "timestamp_component_type"); + TimestampComponentType timestampComponentType; + if (timestampComponentTypeStr == null) { + timestampComponentType = TimestampComponentType.NA; + } else { + timestampComponentType = TimestampComponentType.fromSymbol(timestampComponentTypeStr); + } + String description = getConfigValue(config, "description"); + result.addLast(new SubTokenType(name, baseType, constraint, description, timestampComponentType)); + } + return result; + } + + private ArrayList<TokenType> parseTokenTypes(List<Object> tokenTypesList) { + ArrayList<TokenType> result = new ArrayList<>(); + for (Object obj : tokenTypesList) { + Map<String, Object> typeMap = (Map<String, Object>) obj; + String name = getConfigValue(typeMap, "name"); + String encodingTypeStr = getConfigValue(typeMap, "encoding_type"); + // Remove the '%' prefix + EncodingType encodingType = EncodingType.fromSymbol(encodingTypeStr.charAt(1)); + + String specialSubTokenDelimitersRaw = getConfigValue(typeMap, "special_sub_token_delimiters"); + char[] actualSubTokenDelimiters = combineCharArrays(specialSubTokenDelimitersRaw, subTokenDelimiters); + + String rawFormat = (String) typeMap.get("format"); + TokenFormat format = PatternUtils.parseTokenFormat(rawFormat, actualSubTokenDelimiters, subTokenBaseTypes, subTokenTypes); + + String description = getConfigValue(typeMap, "description"); + result.addLast(new TokenType(name, encodingType, format, description)); + } + + return result; + } + + private char[] combineCharArrays(String specialSubTokenDelimitersConfig, char[] baseDelimiters) { + char[] actualDelimiters; + if (specialSubTokenDelimitersConfig != null) { + char[] specialSubTokenDelimiters = specialSubTokenDelimitersConfig.toCharArray(); + actualDelimiters = new char[baseDelimiters.length + specialSubTokenDelimiters.length]; + System.arraycopy(baseDelimiters, 0, actualDelimiters, 0, baseDelimiters.length); + System.arraycopy(specialSubTokenDelimiters, 0, actualDelimiters, baseDelimiters.length, specialSubTokenDelimiters.length); + } else { + actualDelimiters = baseDelimiters; + } + return actualDelimiters; + } + + ArrayList<MultiTokenType> parseMultiTokenTypes(List<Object> multiTokenTypesList) { + ArrayList<MultiTokenType> result = new ArrayList<>(); + // Create the set of boundary characters once + Set<Character> boundaryChars = getAllTokenBoundaryChars(); + + for (Object obj : multiTokenTypesList) { + Map<String, Object> typeMap = (Map<String, Object>) obj; + String name = getConfigValue(typeMap, "name"); + String encodingTypeStr = getConfigValue(typeMap, "encoding_type"); + // Remove the '%' prefix + EncodingType encodingType = EncodingType.fromSymbol(encodingTypeStr.charAt(1)); + + String rawFormat = (String) typeMap.get("format"); + List<TokenType> formatTokens = new ArrayList<>(); + List<String> formatDelimiterParts = new ArrayList<>(); + PatternUtils.parseMultiTokenFormat(rawFormat, tokenTypes, boundaryChars, formatTokens, formatDelimiterParts); + MultiTokenFormat format = new MultiTokenFormat(rawFormat, formatDelimiterParts, formatTokens); + + String description = getConfigValue(typeMap, "description"); + result.addLast(new MultiTokenType(name, encodingType, format, description)); + } + + return result; + } + + public Set<Character> getAllTokenBoundaryChars() { + Set<Character> boundaryChars = new HashSet<>(); + for (char c : getTokenDelimiters()) { + boundaryChars.add(c); + } + for (char c : getTokenBoundaryCharacters()) { + boundaryChars.add(c); + } + return boundaryChars; + } + + public char[] getTokenDelimiters() { + return tokenDelimiters; + } + + public char[] getSubTokenDelimiters() { + return subTokenDelimiters; + } + + public char[] getTokenBoundaryCharacters() { + return tokenBoundaryCharacters; + } + + public ArrayList<SubTokenBaseType> getSubTokenBaseTypes() { + return subTokenBaseTypes; + } + + public ArrayList<SubTokenType> getSubTokenTypes() { + return subTokenTypes; + } + + public ArrayList<TokenType> getTokenTypes() { + return 
tokenTypes; + } + + public ArrayList getMultiTokenTypes() { + return multiTokenTypes; + } + + public SubTokenBaseType fromSymbol(String symbol) { + for (SubTokenBaseType baseType : subTokenBaseTypes) { + if (baseType.symbol().equals(symbol)) { + return baseType; + } + } + return null; + } + + public SubTokenType getSubTokenType(String name) { + return getTypeByName(subTokenTypes, name); + } + + public TokenType getTokenType(String name) { + return getTypeByName(tokenTypes, name); + } + + public MultiTokenType getMultiTokenType(String name) { + return getTypeByName(multiTokenTypes, name); + } + + public static T getTypeByName(List types, String name) { + for (T type : types) { + if (type.name().equals(name)) { + return type; + } + } + return null; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("Schema {\n"); + + // Special Characters + sb.append(" Special Characters:\n"); + sb.append(" TokenDelimiters: ["); + printCharArray(tokenDelimiters, sb); + sb.append("]\n"); + + sb.append(" SubTokenDelimiters: ["); + printCharArray(subTokenDelimiters, sb); + sb.append("]\n"); + + sb.append(" TokenBoundaryCharacters: ["); + printCharArray(tokenBoundaryCharacters, sb); + sb.append("]\n\n"); + + // SubTokenBaseTypes + sb.append(" SubTokenBaseTypes {\n"); + subTokenBaseTypes.forEach( + baseType -> sb.append(" ") + .append(baseType.name()) + .append(" {\n") + .append(" symbol: ") + .append(baseType.symbol()) + .append("\n") + .append(" encodingType: ") + .append(baseType.encodingType().name()) + .append("\n") + .append(" baseType: ") + .append(baseType.baseType().getSimpleName()) + .append("\n") + .append(" description: ") + .append(baseType.description()) + .append("\n") + .append(" }\n") + ); + sb.append(" }\n\n"); + + // SubTokenTypes + sb.append(" SubTokenTypes {\n"); + subTokenTypes.forEach(subTokenType -> { + sb.append(" ") + .append(subTokenType.name()) + .append(" {\n") + .append(" baseType: ") + .append(subTokenType.getBaseType().name()) + .append("\n") + .append(" constraint: "); + + // Add the appropriate constraint based on the base type + if (subTokenType.getIntConstraint() != null) { + sb.append("Integer constraint"); + } else if (subTokenType.getStringConstraint() != null) { + sb.append("String constraint"); + } + + sb.append("\n").append(" timestampComponentType: ").append(subTokenType.getTimestampComponentType()).append("\n"); + + sb.append(" description: ").append(subTokenType.getDescription()).append("\n").append(" }\n"); + }); + sb.append(" }\n\n"); + + // TokenTypes + sb.append(" TokenTypes {\n"); + tokenTypes.forEach(tokenType -> { + sb.append(" ") + .append(tokenType.name()) + .append(" {\n") + .append(" encodingType: ") + .append(tokenType.encodingType()) + .append("\n") + .append(" format: ") + .append(tokenType.format()) + .append("\n") + .append(" description: ") + .append(tokenType.description()) + .append("\n") + .append(" }\n"); + }); + sb.append(" }\n\n"); + + // MultiTokenTypes + sb.append(" MultiTokenTypes {\n"); + multiTokenTypes.forEach(multiTokenType -> { + sb.append(" ") + .append(multiTokenType.name()) + .append(" {\n") + .append(" encodingType: ") + .append(multiTokenType.encodingType()) + .append("\n") + .append(" format: ") + .append(multiTokenType.getFormat()) + .append("\n") + .append(" description: ") + .append(multiTokenType.getDescription()) + .append("\n") + .append(" }\n"); + }); + sb.append(" }\n"); + + sb.append("}"); + return sb.toString(); + } + + private void printCharArray(char[] tokenDelimiters, StringBuilder sb) 
{ + for (int i = 0; i < tokenDelimiters.length; i++) { + char character = tokenDelimiters[i]; + sb.append(escapeChar(character)); + if (i < tokenDelimiters.length - 1) { + sb.append(","); + } + } + } + + private String escapeChar(char c) { + return switch (c) { + case '\t' -> "\\t"; + case '\n' -> "\\n"; + case '\r' -> "\\r"; + case '\f' -> "\\f"; + case '\b' -> "\\b"; + case '\\' -> "\\\\"; + case '\"' -> "\\\""; + case '\'' -> "\\'"; + case ' ' -> "SPACE"; + default -> { + if (Character.isISOControl(c)) { + yield String.format(Locale.ROOT, "\\u%04x", (int) c); + } + yield String.valueOf(c); + } + }; + } + + private static String getConfigValue(Map typeMap, String symbol) { + String ret = (String) typeMap.get(symbol); + return ret == null ? null : ret.trim(); + } + + private static Map readYamlFileToMap() throws IOException { + try ( + InputStream schemaFileIS = Schema.class.getResourceAsStream(SCHEMA_PATH); + XContentParser yamlParser = XContentFactory.xContent(XContentType.YAML) + .createParser(XContentParserConfiguration.EMPTY, schemaFileIS) + ) { + return yamlParser.map(); + } + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/SubTokenBaseType.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/SubTokenBaseType.java new file mode 100644 index 0000000000000..78fbf5403fb46 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/SubTokenBaseType.java @@ -0,0 +1,25 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +public record SubTokenBaseType( + String name, + EncodingType encodingType, + String symbol, + Class baseType, + String description, + char[] allowedCharacters +) { + + @Override + public String toString() { + return name; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/SubTokenType.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/SubTokenType.java new file mode 100644 index 0000000000000..d2e99259b86df --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/SubTokenType.java @@ -0,0 +1,110 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.TimestampComponentType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.Type; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.IntConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.IntConstraints; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringConstraints; + +public class SubTokenType implements Type { + public static final String ADHOC_PREFIX = "adhoc_"; + + private final String name; + private final String description; + private final SubTokenBaseType baseType; + private final String rawConstraint; + private final IntConstraint intConstraint; + private final StringConstraint stringConstraint; + private final TimestampComponentType timestampComponentType; + + public SubTokenType(String name, SubTokenBaseType baseType, String constraint, String description) { + this(name, baseType, constraint, description, TimestampComponentType.NA); + } + + public SubTokenType( + String name, + SubTokenBaseType baseType, + String constraint, + String description, + TimestampComponentType timestampComponentType + ) { + this.name = name; + this.baseType = baseType; + this.rawConstraint = constraint; + this.description = description; + this.timestampComponentType = timestampComponentType; + + Class type = baseType.baseType(); + + if (int.class.equals(type)) { + this.intConstraint = IntConstraints.parseIntConstraint(constraint); + this.stringConstraint = null; + } else if (String.class.equals(type)) { + this.intConstraint = null; + this.stringConstraint = StringConstraints.parseStringConstraint(constraint); + } else { + throw new IllegalArgumentException("Unsupported base type: " + baseType.baseType()); + } + } + + public String name() { + return name; + } + + public SubTokenBaseType getBaseType() { + return baseType; + } + + public String getDescriptor() { + if (name.startsWith(SubTokenType.ADHOC_PREFIX)) { + return "%" + baseType.symbol() + rawConstraint; + } + return name; + } + + public String getDescription() { + return description; + } + + @Override + public EncodingType encodingType() { + return baseType.encodingType(); + } + + public IntConstraint getIntConstraint() { + return intConstraint; + } + + public StringConstraint getStringConstraint() { + return stringConstraint; + } + + public TimestampComponentType getTimestampComponentType() { + return timestampComponentType; + } + + public char[] getValidCharacters() { + if (intConstraint != null) { + return baseType.allowedCharacters(); + } + if (stringConstraint != null) { + char[] validCharacters = stringConstraint.getValidCharacters(); + if (validCharacters == null) { + // if no specific characters are defined, return the base type's allowed characters + return baseType.allowedCharacters(); + } + return validCharacters; + } + // no constraints defined, return the base type's allowed characters + return baseType.allowedCharacters(); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/TokenFormat.java 
b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/TokenFormat.java new file mode 100644 index 0000000000000..823191b8210ca --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/TokenFormat.java @@ -0,0 +1,48 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema; + +public final class TokenFormat { + private final String rawFormat; + private final char[] subTokenDelimiters; + private final SubTokenType[] subTokenTypes; + + public TokenFormat(String rawFormat, char[] subTokenDelimiters, SubTokenType[] subTokens) { + this.rawFormat = rawFormat; + this.subTokenDelimiters = subTokenDelimiters; + this.subTokenTypes = subTokens; + } + + public String getRawFormat() { + return rawFormat; + } + + public char[] getSubTokenDelimiters() { + return subTokenDelimiters; + } + + public SubTokenType[] getSubTokenTypes() { + return subTokenTypes; + } + + public int getNumberOfSubTokens() { + return subTokenTypes.length; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < subTokenTypes.length; i++) { + sb.append('$').append(subTokenTypes[i].name()); + if (i < subTokenDelimiters.length) { + sb.append(subTokenDelimiters[i]); + } + } + return sb.toString(); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/TokenType.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/TokenType.java new file mode 100644 index 0000000000000..1f34ebe7a8c50 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/TokenType.java @@ -0,0 +1,17 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.Type; + +public record TokenType(String name, EncodingType encodingType, TokenFormat format, String description) implements Type { + public int getNumberOfSubTokens() { + return format.getNumberOfSubTokens(); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/AndStringConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/AndStringConstraint.java new file mode 100644 index 0000000000000..8d173a1648d06 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/AndStringConstraint.java @@ -0,0 +1,49 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.OperatorType; + +import java.util.HashSet; +import java.util.Set; + +public record AndStringConstraint(StringConstraint first, StringConstraint second) implements StringConstraint { + + @Override + public OperatorType getType() { + return OperatorType.AND; + } + + @Override + public char[] getValidCharacters() { + char[] thisChars = first.getValidCharacters(); + char[] otherChars = second.getValidCharacters(); + if (thisChars == null) { + return otherChars; + } + if (otherChars == null) { + return thisChars; + } + // return the intersection of valid characters + Set validChars = new HashSet<>(); + for (char c : thisChars) { + if (new String(otherChars).indexOf(c) >= 0) { + validChars.add(c); + } + } + return validChars.stream() + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString() + .toCharArray(); + } + + @Override + public boolean isApplicable(String value) { + return first.isApplicable(value) && second.isApplicable(value); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/AnyInteger.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/AnyInteger.java new file mode 100644 index 0000000000000..48b4056425233 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/AnyInteger.java @@ -0,0 +1,27 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +/** + * No constraint - all values are valid. + */ +public final class AnyInteger implements IntConstraint { + public static final IntConstraint INSTANCE = new AnyInteger(); + + private AnyInteger() {} + + @Override + public boolean isApplicable(int value) { + return true; + } + + @Override + public IntConstraints.Range[] trueRanges() { + return new IntConstraints.Range[] { new IntConstraints.Range(Integer.MIN_VALUE, Integer.MAX_VALUE) }; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/AnyString.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/AnyString.java new file mode 100644 index 0000000000000..a14202b2f6288 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/AnyString.java @@ -0,0 +1,29 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.OperatorType; + +public record AnyString() implements StringConstraint { + public static final StringConstraint INSTANCE = new AnyString(); + + @Override + public OperatorType getType() { + return OperatorType.ANY; + } + + @Override + public char[] getValidCharacters() { + return null; + } + + @Override + public boolean isApplicable(String value) { + return true; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/EqualsIntConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/EqualsIntConstraint.java new file mode 100644 index 0000000000000..183695d75c89a --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/EqualsIntConstraint.java @@ -0,0 +1,27 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +// Equality (==) +public final class EqualsIntConstraint implements IntConstraint { + private final int targetValue; + + EqualsIntConstraint(int targetValue) { + this.targetValue = targetValue; + } + + @Override + public boolean isApplicable(int value) { + return value == targetValue; + } + + @Override + public IntConstraints.Range[] trueRanges() { + return new IntConstraints.Range[] { new IntConstraints.Range(targetValue, targetValue) }; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/EqualsStringConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/EqualsStringConstraint.java new file mode 100644 index 0000000000000..21a83e66b008d --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/EqualsStringConstraint.java @@ -0,0 +1,33 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.OperatorType; + +public record EqualsStringConstraint(String targetValue) implements StringConstraint { + + @Override + public OperatorType getType() { + return OperatorType.EQUALITY; + } + + @Override + public char[] getValidCharacters() { + return targetValue.chars() + .distinct() + .mapToObj(c -> (char) c) + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString() + .toCharArray(); + } + + @Override + public boolean isApplicable(String value) { + return targetValue.equals(value); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/GreaterThanIntConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/GreaterThanIntConstraint.java new file mode 100644 index 0000000000000..1952b4516c19a --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/GreaterThanIntConstraint.java @@ -0,0 +1,27 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +// Greater Than (>) +public final class GreaterThanIntConstraint implements IntConstraint { + private final int threshold; + + public GreaterThanIntConstraint(int threshold) { + this.threshold = threshold; + } + + @Override + public boolean isApplicable(int value) { + return value > threshold; + } + + @Override + public IntConstraints.Range[] trueRanges() { + return new IntConstraints.Range[] { new IntConstraints.Range(threshold + 1, Integer.MAX_VALUE) }; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/GreaterThanOrEqualIntConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/GreaterThanOrEqualIntConstraint.java new file mode 100644 index 0000000000000..8e27db32527c2 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/GreaterThanOrEqualIntConstraint.java @@ -0,0 +1,27 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +// Greater Than or Equal (>=) +public final class GreaterThanOrEqualIntConstraint implements IntConstraint { + private final int threshold; + + public GreaterThanOrEqualIntConstraint(int threshold) { + this.threshold = threshold; + } + + @Override + public boolean isApplicable(int value) { + return value >= threshold; + } + + @Override + public IntConstraints.Range[] trueRanges() { + return new IntConstraints.Range[] { new IntConstraints.Range(threshold, Integer.MAX_VALUE) }; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/IntConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/IntConstraint.java new file mode 100644 index 0000000000000..c2bca76985e3d --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/IntConstraint.java @@ -0,0 +1,107 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; + +public interface IntConstraint { + /** + * Returns an ordered array of non-overlapping inclusive ranges that represent the valid values for this constraint. + * + * @return an array of Range objects representing the valid ranges + */ + IntConstraints.Range[] trueRanges(); + + /** + * Tests if the given integer value satisfies this constraint. 
+ * @param value the integer value to test + * @return true if the value satisfies the constraint, false otherwise + */ + boolean isApplicable(int value); + + default IntConstraint and(IntConstraint constraint) { + if (constraint == null) { + throw new IllegalArgumentException("Constraint cannot be null"); + } + + return new IntConstraint() { + @Override + public IntConstraints.Range[] trueRanges() { + IntConstraints.Range[] thisRanges = IntConstraint.this.trueRanges(); + IntConstraints.Range[] otherRanges = constraint.trueRanges(); + ArrayList combinedRanges = new ArrayList<>(); + for (IntConstraints.Range r1 : thisRanges) { + for (IntConstraints.Range r2 : otherRanges) { + if (r1.upperBound() >= r2.lowerBound() && r1.lowerBound() <= r2.upperBound()) { + combinedRanges.add( + new IntConstraints.Range( + Math.max(r1.lowerBound(), r2.lowerBound()), + Math.min(r1.upperBound(), r2.upperBound()) + ) + ); + } + } + } + return combinedRanges.toArray(new IntConstraints.Range[0]); + } + + @Override + public boolean isApplicable(int value) { + return IntConstraint.this.isApplicable(value) && constraint.isApplicable(value); + } + }; + } + + default IntConstraint or(IntConstraint constraint) { + if (constraint == null) { + throw new IllegalArgumentException("Constraint cannot be null"); + } + + return new IntConstraint() { + @Override + public IntConstraints.Range[] trueRanges() { + IntConstraints.Range[] thisRanges = IntConstraint.this.trueRanges(); + IntConstraints.Range[] otherRanges = constraint.trueRanges(); + ArrayList combinedRanges = new ArrayList<>(); + combinedRanges.addAll(Arrays.asList(thisRanges)); + combinedRanges.addAll(Arrays.asList(otherRanges)); + combinedRanges.sort(Comparator.comparingInt(IntConstraints.Range::lowerBound)); + + // merge overlapping ranges + ArrayList mergedRanges = new ArrayList<>(); + IntConstraints.Range current = null; + for (IntConstraints.Range range : combinedRanges) { + if (current == null) { + current = range; + } else if (current.upperBound() >= range.lowerBound()) { + // Overlapping ranges, merge them + current = new IntConstraints.Range( + Math.min(current.lowerBound(), range.lowerBound()), + Math.max(current.upperBound(), range.upperBound()) + ); + } else { + // no overlap, add the current range and make the new range current + mergedRanges.add(current); + current = range; + } + } + if (current != null) { + mergedRanges.add(current); + } + return mergedRanges.toArray(new IntConstraints.Range[0]); + } + + @Override + public boolean isApplicable(int value) { + return IntConstraint.this.isApplicable(value) || constraint.isApplicable(value); + } + }; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/IntConstraints.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/IntConstraints.java new file mode 100644 index 0000000000000..02b1bab248b32 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/IntConstraints.java @@ -0,0 +1,126 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.OperatorType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.PatternUtils; + +public class IntConstraints { + + /** + * Represents an inclusive range of integers with a lower and upper bound. + */ + public record Range(int lowerBound, int upperBound) {} + + /** + * Parses a raw constraint string from schema.yaml and returns an {@link IntConstraint} + * that implements the constraint. + * Handles: + *
    + *
  • Simple constraints: {@code 0-255}, {@code >=0}, {@code ==7}, etc.
 + *   • Logical operators: {@code &&} (AND), {@code ||} (OR)
 + *   • Negative numbers: {@code (-128)} for negative values
+ * + * @param rawConstraint the constraint string as defined in schema.yaml + * @return an IntConstraint that evaluates the constraint + * @throws IllegalArgumentException if the constraint format is invalid + */ + public static IntConstraint parseIntConstraint(String rawConstraint) { + if (rawConstraint == null || rawConstraint.trim().isEmpty()) { + return AnyInteger.INSTANCE; + } + + String constraint = rawConstraint.trim(); + + // Handle logical operators first + String[] chainedConstraints = PatternUtils.splitChainedAndConstraints(constraint); + if (chainedConstraints.length == 2) { + return parseIntConstraint(chainedConstraints[0]).and(parseIntConstraint(chainedConstraints[1])); + } + chainedConstraints = PatternUtils.splitChainedOrConstraints(constraint); + if (chainedConstraints.length == 2) { + return parseIntConstraint(chainedConstraints[0]).or(parseIntConstraint(chainedConstraints[1])); + } + + // Handle simple constraints + PatternUtils.ConstraintDetails details = PatternUtils.parseSimpleConstraint(constraint); + String[] stringOperands = details.operands(); + int[] intOperands = new int[stringOperands.length]; + + // Convert string operands to integers + for (int i = 0; i < intOperands.length; i++) { + try { + intOperands[i] = Integer.parseInt(stringOperands[i]); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Invalid numeric value in constraint: " + stringOperands[i]); + } + } + + // Create and return the appropriate IntPredicate + return createConstraint(details.operator(), intOperands); + } + + /** + * Creates an {@link IntConstraint} based on the given operator and operands. + * + * @param operator the operator type from OperatorType enum + * @param operands the integer operands for the constraint + * @return an IntConstraint that evaluates the constraint + * @throws IllegalArgumentException if the operator is unsupported or operands are invalid + */ + public static IntConstraint createConstraint(OperatorType operator, int... 
operands) { + return switch (operator) { + case EQUALITY -> { + assertSingleOperand(operator.getSymbol(), operands); + yield new EqualsIntConstraint(operands[0]); + } + case LESS_THAN -> { + assertSingleOperand(operator.getSymbol(), operands); + yield new LessThanIntConstraint(operands[0]); + } + case GREATER_THAN -> { + assertSingleOperand(operator.getSymbol(), operands); + yield new GreaterThanIntConstraint(operands[0]); + } + case LESS_THAN_OR_EQUAL -> { + assertSingleOperand(operator.getSymbol(), operands); + yield new LessThanOrEqualIntConstraint(operands[0]); + } + case GREATER_THAN_OR_EQUAL -> { + assertSingleOperand(operator.getSymbol(), operands); + yield new GreaterThanOrEqualIntConstraint(operands[0]); + } + case NOT_EQUAL -> { + assertSingleOperand(operator.getSymbol(), operands); + yield new NotEqualsIntConstraint(operands[0]); + } + case RANGE -> { + assertTwoOperands(operator.getSymbol(), operands); + yield new RangeIntConstraint(operands[0], operands[1]); + } + case SET -> new SetIntConstraint(operands); + case LENGTH -> new LengthIntConstraint(operands[0]); + default -> throw new IllegalArgumentException( + "The operator '" + operator.getSymbol() + "' is not supported for integer constraints" + ); + }; + } + + private static void assertSingleOperand(String operator, int[] operands) { + if (operands.length != 1) { + throw new IllegalArgumentException("Operator '" + operator + "' expects exactly one operand, but got " + operands.length); + } + } + + private static void assertTwoOperands(String operator, int[] operands) { + if (operands.length != 2) { + throw new IllegalArgumentException("Operator '" + operator + "' expects exactly two operands, but got " + operands.length); + } + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/LengthIntConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/LengthIntConstraint.java new file mode 100644 index 0000000000000..d44ebde8ad528 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/LengthIntConstraint.java @@ -0,0 +1,29 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
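For orientation, a minimal sketch of how the parsed constraints compose; it is illustrative only, assumes the range and {@code !=} syntax documented in schema.yaml, and uses only the IntConstraint/IntConstraints API introduced in this change (the IntConstraintSketch class name is a placeholder):

    import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.IntConstraint;
    import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.IntConstraints;

    class IntConstraintSketch {
        static void example() {
            // "0-255" parses to an inclusive range constraint, "!=0" to a not-equal constraint.
            IntConstraint octet = IntConstraints.parseIntConstraint("0-255");
            IntConstraint nonZero = IntConstraints.parseIntConstraint("!=0");
            IntConstraint combined = octet.and(nonZero);

            // and() evaluates both sides and intersects their range sets:
            // [0,255] intersected with ([MIN,-1] union [1,MAX]) leaves the single range [1,255].
            assert combined.isApplicable(10);
            assert combined.isApplicable(0) == false;
            IntConstraints.Range[] ranges = combined.trueRanges();
            assert ranges.length == 1 && ranges[0].lowerBound() == 1 && ranges[0].upperBound() == 255;
        }
    }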
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +// Length constraint ({}) +public final class LengthIntConstraint implements IntConstraint { + int lowerBound; + int upperBound; + + public LengthIntConstraint(int length) { + this.lowerBound = (int) Math.pow(10, length - 1); + this.upperBound = (int) Math.pow(10, length) - 1; + } + + @Override + public boolean isApplicable(int value) { + return value >= lowerBound && value <= upperBound; + } + + @Override + public IntConstraints.Range[] trueRanges() { + return new IntConstraints.Range[] { new IntConstraints.Range(lowerBound, upperBound) }; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/LengthStringConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/LengthStringConstraint.java new file mode 100644 index 0000000000000..528cfba136086 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/LengthStringConstraint.java @@ -0,0 +1,28 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.OperatorType; + +public record LengthStringConstraint(int requiredLength) implements StringConstraint { + + @Override + public OperatorType getType() { + return OperatorType.LENGTH; + } + + @Override + public char[] getValidCharacters() { + return null; + } + + @Override + public boolean isApplicable(String value) { + return value.length() == requiredLength; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/LessThanIntConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/LessThanIntConstraint.java new file mode 100644 index 0000000000000..b2be84495c192 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/LessThanIntConstraint.java @@ -0,0 +1,27 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
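A short illustration of the length constraints above (a sketch under the same assumptions, not part of the change; LengthConstraintSketch is a placeholder name): LengthIntConstraint maps a digit count to a numeric range, while LengthStringConstraint checks character count directly.

    import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.IntConstraint;
    import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.LengthIntConstraint;
    import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.LengthStringConstraint;
    import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringConstraint;

    class LengthConstraintSketch {
        static void example() {
            IntConstraint threeDigits = new LengthIntConstraint(3);     // {3}: the 3-digit values 100..999
            assert threeDigits.isApplicable(250);
            assert threeDigits.isApplicable(99) == false;

            StringConstraint fourChars = new LengthStringConstraint(4); // {4}: any 4-character value, e.g. "2023"
            assert fourChars.isApplicable("2023");
        }
    }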
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +// Less Than (<) +public final class LessThanIntConstraint implements IntConstraint { + private final int threshold; + + public LessThanIntConstraint(int threshold) { + this.threshold = threshold; + } + + @Override + public boolean isApplicable(int value) { + return value < threshold; + } + + @Override + public IntConstraints.Range[] trueRanges() { + return new IntConstraints.Range[] { new IntConstraints.Range(Integer.MIN_VALUE, threshold - 1) }; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/LessThanOrEqualIntConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/LessThanOrEqualIntConstraint.java new file mode 100644 index 0000000000000..b71cb7232af21 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/LessThanOrEqualIntConstraint.java @@ -0,0 +1,27 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +// Less Than or Equal (<=) +public final class LessThanOrEqualIntConstraint implements IntConstraint { + private final int threshold; + + public LessThanOrEqualIntConstraint(int threshold) { + this.threshold = threshold; + } + + @Override + public boolean isApplicable(int value) { + return value <= threshold; + } + + @Override + public IntConstraints.Range[] trueRanges() { + return new IntConstraints.Range[] { new IntConstraints.Range(Integer.MIN_VALUE, threshold) }; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/NotEqualsIntConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/NotEqualsIntConstraint.java new file mode 100644 index 0000000000000..7b1b2f920c9dd --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/NotEqualsIntConstraint.java @@ -0,0 +1,29 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +// Not Equal (!=) +public final class NotEqualsIntConstraint implements IntConstraint { + private final int targetValue; + + public NotEqualsIntConstraint(int targetValue) { + this.targetValue = targetValue; + } + + @Override + public boolean isApplicable(int value) { + return value != targetValue; + } + + @Override + public IntConstraints.Range[] trueRanges() { + return new IntConstraints.Range[] { + new IntConstraints.Range(Integer.MIN_VALUE, targetValue - 1), + new IntConstraints.Range(targetValue + 1, Integer.MAX_VALUE) }; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/NotEqualsStringConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/NotEqualsStringConstraint.java new file mode 100644 index 0000000000000..53882d9f8bf47 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/NotEqualsStringConstraint.java @@ -0,0 +1,28 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.OperatorType; + +public record NotEqualsStringConstraint(String targetValue) implements StringConstraint { + + @Override + public OperatorType getType() { + return OperatorType.NOT_EQUAL; + } + + @Override + public char[] getValidCharacters() { + return null; + } + + @Override + public boolean isApplicable(String value) { + return targetValue.equals(value) == false; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/OrStringConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/OrStringConstraint.java new file mode 100644 index 0000000000000..4fa2e4d69c649 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/OrStringConstraint.java @@ -0,0 +1,47 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.OperatorType; + +import java.util.HashSet; +import java.util.Set; + +public record OrStringConstraint(StringConstraint first, StringConstraint second) implements StringConstraint { + + @Override + public OperatorType getType() { + return OperatorType.OR; + } + + @Override + public char[] getValidCharacters() { + char[] thisChars = first.getValidCharacters(); + char[] otherChars = second.getValidCharacters(); + if (thisChars == null || otherChars == null) { + return null; + } + // return the union of valid characters + Set validChars = new HashSet<>(); + for (char c : thisChars) { + validChars.add(c); + } + for (char c : otherChars) { + validChars.add(c); + } + return validChars.stream() + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString() + .toCharArray(); + } + + @Override + public boolean isApplicable(String value) { + return first.isApplicable(value) || second.isApplicable(value); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/RangeIntConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/RangeIntConstraint.java new file mode 100644 index 0000000000000..0900bc1af7a03 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/RangeIntConstraint.java @@ -0,0 +1,35 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +// Inclusive Range (-) +public final class RangeIntConstraint implements IntConstraint { + private final int lowerBound; + private final int upperBound; + + public RangeIntConstraint(int lowerBound, int upperBound) { + if (lowerBound > upperBound) { + throw new IllegalArgumentException("Lower bound cannot be greater than upper bound in range constraint"); + } + this.lowerBound = lowerBound; + this.upperBound = upperBound; + } + + @Override + public boolean isApplicable(int value) { + return value >= lowerBound && value <= upperBound; + } + + @Override + public IntConstraints.Range[] trueRanges() { + if (lowerBound > upperBound) { + throw new IllegalArgumentException("Lower bound cannot be greater than upper bound in range constraint"); + } + return new IntConstraints.Range[] { new IntConstraints.Range(lowerBound, upperBound) }; + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/SetIntConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/SetIntConstraint.java new file mode 100644 index 0000000000000..1029232438291 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/SetIntConstraint.java @@ -0,0 +1,36 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Set; +import java.util.stream.Collectors; + +// OR operator (|) +public final class SetIntConstraint implements IntConstraint { + private final int[] allowedValues; + private final Set allowedValuesSet; + + public SetIntConstraint(int[] allowedValues) { + this.allowedValuesSet = Arrays.stream(allowedValues).boxed().collect(Collectors.toSet()); + this.allowedValues = allowedValues; + } + + @Override + public boolean isApplicable(int value) { + return allowedValuesSet.contains(value); + } + + @Override + public IntConstraints.Range[] trueRanges() { + ArrayList ranges = new ArrayList<>(); + Arrays.stream(allowedValues).sorted().forEach(val -> ranges.add(new IntConstraints.Range(val, val))); + return ranges.toArray(new IntConstraints.Range[0]); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringConstraint.java new file mode 100644 index 0000000000000..93bf7ed960b54 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringConstraint.java @@ -0,0 +1,47 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.OperatorType; + +public interface StringConstraint { + + /** + * Returns the type of this constraint, which is one of the {@link OperatorType} values. + * This method is used to determine how the constraint should be applied in logical operations. + * + * @return the operator type of this constraint + */ + OperatorType getType(); + + /** + * Returns the characters that are valid for this constraint. For example, a Set constraint that includes the values "One", "Two" + * and "Three" would return the characters {'O', 'n', 'e', 'T', 'w', 'o', 'h', 'r', 'e'}. + * If the constraint does not have a specific set of valid characters, this method should return null. + * If no characters are valid, this method should return an empty array. + * + * @return an array of valid characters for this constraint, or null if there are no specific valid characters + */ + char[] getValidCharacters(); + + /** + * Evaluates the constraint against a given string value. 
+ * + * @param value the string value to evaluate + * @return true if the value satisfies the constraint, false otherwise + */ + boolean isApplicable(String value); + + default StringConstraint and(final StringConstraint other) { + return new AndStringConstraint(this, other); + } + + default StringConstraint or(final StringConstraint other) { + return new OrStringConstraint(this, other); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringConstraints.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringConstraints.java new file mode 100644 index 0000000000000..424860061e1de --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringConstraints.java @@ -0,0 +1,109 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.OperatorType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.PatternUtils; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; + +public class StringConstraints { + + /** + * Parses a raw constraint string from schema.yaml and returns a {@link StringConstraint} + * that implements the constraint. + * Handles: + *
    + *
  • Simple constraints: {@code ==value}, {@code !=value}, {@code Jan|Feb|Mar}, {@code {3}}
 + *   • Logical operators: {@code &&} (AND), {@code ||} (OR)
+ * + * @param rawConstraint the constraint string as defined in schema.yaml + * @return a {@link StringConstraint} that evaluates the constraint + */ + public static StringConstraint parseStringConstraint(String rawConstraint) { + if (rawConstraint == null || rawConstraint.trim().isEmpty()) { + return AnyString.INSTANCE; + } + + String constraint = rawConstraint.trim(); + + // Handle logical operators first + String[] chainedConstraints = PatternUtils.splitChainedAndConstraints(constraint); + if (chainedConstraints.length == 2) { + return parseStringConstraint(chainedConstraints[0]).and(parseStringConstraint(chainedConstraints[1])); + } + chainedConstraints = PatternUtils.splitChainedOrConstraints(constraint); + if (chainedConstraints.length == 2) { + return parseStringConstraint(chainedConstraints[0]).or(parseStringConstraint(chainedConstraints[1])); + } + + // Handle simple constraints + PatternUtils.ConstraintDetails details = PatternUtils.parseSimpleConstraint(constraint); + return createConstraint(details.operator(), details.operands()); + } + + /** + * Creates a {@link StringConstraint} based on the given operator and operands. + * + * @param operator the operator type from OperatorType enum + * @param operands the string operands for the constraint + * @return a {@link StringConstraint} that evaluates the constraint + * @throws IllegalArgumentException if the operator is unsupported or operands are invalid + */ + public static StringConstraint createConstraint(OperatorType operator, String... operands) { + return switch (operator) { + case EQUALITY -> { + assertSingleOperand(operator.getSymbol(), operands); + yield new EqualsStringConstraint(operands[0]); + } + case NOT_EQUAL -> { + assertSingleOperand(operator.getSymbol(), operands); + yield new NotEqualsStringConstraint(operands[0]); + } + case SET -> new StringSetConstraint(new HashSet<>(Arrays.asList(operands))); + case MAP -> { + if (operands.length % 2 != 0) { + throw new IllegalArgumentException("Map operator requires an even number of operands, got: " + operands.length); + } + Map map = new HashMap<>(); + for (int i = 0; i < operands.length; i += 2) { + String key = operands[i]; + try { + int value = Integer.parseInt(operands[i + 1]); + map.put(key, value); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Map operator requires numeric values, got: " + operands[i + 1]); + } + } + yield new StringToIntMapConstraint(map); + } + case LENGTH -> { + assertSingleOperand(operator.getSymbol(), operands); + try { + int length = Integer.parseInt(operands[0]); + yield new LengthStringConstraint(length); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Length constraint requires a numeric value, got: " + operands[0]); + } + } + default -> throw new IllegalArgumentException( + "The operator '" + operator.getSymbol() + "' is not supported for string constraints" + ); + }; + } + + private static void assertSingleOperand(String operator, String[] operands) { + if (operands.length != 1) { + throw new IllegalArgumentException("Operator '" + operator + "' expects exactly one operand, but got " + operands.length); + } + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringSetConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringSetConstraint.java new file mode 100644 index 0000000000000..28bd261e08b49 --- /dev/null +++ 
b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringSetConstraint.java @@ -0,0 +1,35 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.OperatorType; + +import java.util.Set; + +public record StringSetConstraint(Set keys) implements StringConstraint { + + @Override + public OperatorType getType() { + return OperatorType.SET; + } + + @Override + public char[] getValidCharacters() { + return keys.stream() + .flatMap(value -> value.chars().mapToObj(c -> (char) c)) + .distinct() + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString() + .toCharArray(); + } + + @Override + public boolean isApplicable(String value) { + return keys.contains(value); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringToIntMapConstraint.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringToIntMapConstraint.java new file mode 100644 index 0000000000000..09bcecd615876 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringToIntMapConstraint.java @@ -0,0 +1,36 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
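A brief usage sketch for the string constraints (illustrative only; it assumes the '|' set syntax used by the Day and Mon sub-token types in schema.yaml is reported by PatternUtils.parseSimpleConstraint as a SET operator, and StringConstraintSketch is a placeholder name):

    import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringConstraint;
    import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringConstraints;

    class StringConstraintSketch {
        static void example() {
            // "Sun|Mon|Tue" should yield a StringSetConstraint restricted to those three values.
            StringConstraint day = StringConstraints.parseStringConstraint("Sun|Mon|Tue");
            assert day.isApplicable("Mon");
            assert day.isApplicable("Fri") == false;

            // getValidCharacters() returns the distinct characters across the allowed values
            // (see StringSetConstraint above), here S, u, n, M, o, T, e.
            char[] valid = day.getValidCharacters();
            assert new String(valid).indexOf('S') >= 0;
        }
    }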
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.OperatorType; + +import java.util.Map; + +public record StringToIntMapConstraint(Map map) implements StringConstraint { + + @Override + public OperatorType getType() { + return OperatorType.MAP; + } + + @Override + public char[] getValidCharacters() { + return map.entrySet() + .stream() + .flatMap(entry -> entry.getKey().chars().mapToObj(c -> (char) c)) + .distinct() + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString() + .toCharArray(); + } + + @Override + public boolean isApplicable(String value) { + return map.containsKey(value); + } +} diff --git a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patterntext/PatternTextValueProcessor.java b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patterntext/PatternTextValueProcessor.java index 66954daaf940d..6e240f6c00aa0 100644 --- a/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patterntext/PatternTextValueProcessor.java +++ b/x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patterntext/PatternTextValueProcessor.java @@ -48,7 +48,7 @@ static String templateId(String template) { return Strings.BASE_64_NO_PADDING_URL_ENCODER.encodeToString(hashBytes); } - static Parts split(String text) { + public static Parts split(String text) { if (text.length() > MAX_LOG_LEN_TO_STORE_AS_DOC_VALUE) { return splitInternal(CharBuffer.wrap(text).subSequence(0, MAX_LOG_LEN_TO_STORE_AS_DOC_VALUE), true); } else { diff --git a/x-pack/plugin/logsdb/src/main/resources/schema.yaml b/x-pack/plugin/logsdb/src/main/resources/schema.yaml new file mode 100644 index 0000000000000..645ae3a54dd81 --- /dev/null +++ b/x-pack/plugin/logsdb/src/main/resources/schema.yaml @@ -0,0 +1,344 @@ +# SCHEMA DEFINITION FILE +# This file contains definitions for parsing text using predefined constructs +# It is structured hierarchically, with each component building upon previous components +# + +# -------------------------------------------------- Definitions section -------------------------------------------------- +# This section is informative and documents the available constructs that can be used in the schema section. +# Changes here will not affect the parser directly, but serve as reference for what's available in the schema section below. +predefined_constructs: + # Allowed pattern syntax for the definition of subToken base types and token types. + pattern_syntax: + # NOTE: Special characters in patterns need proper escaping: + # '-' needs to be escaped with '\', and '\' needs to be escaped with '\\', so use '\\-' and '\\' in the pattern respectively. + character_groups: "[abcd]" # Matches any single character in the group, e.g. "[abcd]" matches 'a', 'b', 'c', or 'd' + character_ranges: "[0-9a-f]" # Matches any single character in the range, e.g. "[0-9a-f]" matches digits and a-f + # Operators are used in constraints to define the rules for subToken types. See the examples below for more details. + operators: + equality: "==" # Exact equality, e.g. "==7" means the value must be exactly 7 + inequalities: ["<", ">", "<=", ">=", "!="] # Standard inequality operators, e.g. "<5" means less than 5 + inclusive_range: "-" # Range including both endpoints, e.g. "0-255" means 0 to 255, inclusive + or: "|" # Logical OR for alternatives, e.g. 
"a|b|c" means either a, b, or c + map: "=|" # Maps string values to numeric values, e.g. "Jan=1|Feb=2|..." maps month names to numbers + length: "{n}" # Constrains length, e.g. "{3}" means exactly 3 characters long + # Token encoding types are used to define the encoding of tokens. They correspond to the eventual message template argument types. + # IMPORTANT: When used in the schema section, they MUST be prefixed with a '%' character. + token_encoding_types: + A: "text" # Text value, retained as a string + I: "integer" # Integer numeric value + H: "hexadecimal" # Hexadecimal value, retained as a string + F: "floating point" # Floating point numeric value + 4: "IPv4" # IPv4 address + V: "IPv4 Address" # IPv4 and port combined + U: "UUID" # UUID value + T: "timestamp" # Timestamp value + # The timestamp component types are used to define the components of a timestamp in the schema section. + # These are used in subtokens that represent components of a timestamp and must be specified in the timestamp_component_type field + # for any subtoken that will be used in a timestamp format. + timestamp_component_types: + - Y: "year" # Year value (e.g., 2023) + - M: "month of the year" # Month value (1-12) + - D: "day of the month" # Day value (1-31) + - h: "hour" # Hour value (0-23 or 1-12 depending on format) + - AP: "AM/PM indicator" # AM should be assigned to the value "1" and PM to the value "2" + - m: "minute" # Minute value (0-59) + - s: "second" # Second value (0-59) + - ms: "millisecond" # Millisecond value (0-999) + - us: "microsecond" # Microsecond value (0-999999) + - ns: "nanosecond" # Nanosecond value + - TZh: "timezone offset in hours" # Timezone hour offset, e.g.: "-05", "+02" + - TZm: "timezone offset in minutes" # Timezone minute offset, e.g.: "-30", "+15" + - TZhm: "timezone offset in hours and minutes" # Combined timezone offset, e.g.: "-0500", "+0230" + - NA: "to ignore" # Used for sub-tokens that appear in a timestamp format but aren't used for computation + +# --------------------------------------------------- Schema section --------------------------------------------------- +# This section contains the actual schema rules for text parsing. +# Changes in this section will directly affect how the parser interprets text. +# The schema is built hierarchically, with each section depending on previous sections. + +# Special characters define how the parser should split and handle input text. +# Modifying these will directly affect how tokens and subtokens are extracted from text. +special_characters: + # characters that delimit tokens in the text (typically spaces and tabs) + token_delimiters: " \t" + # characters that delimit subTokens within a token (e.g., periods in IPv4 addresses) + sub_token_delimiters: |- + -+_:=/. + # characters that may be found in the boundaries of tokens, but are not considered part of them (e.g., brackets and quotes) + # these characters are not considered part of the token if it is identified as a template argument + token_boundary_characters: |- + {}[]();,'%" + +# The schema rules are built hierarchically, with each level building upon previous definitions. +# Rules defined earlier can be referenced by rules defined later. +# When referencing token encoding types, use the '%' prefix (e.g., %I for integer). +# When referencing defined subtokens or tokens, use the '$' prefix (e.g., $octet). +schema: + # SubToken base types define the character sets allowed in a subToken and its basic encoding. 
+ # These serve as building blocks for more specific subToken types. + # Structure for each entry: + # name: descriptive name for this base type + # symbol: single character identifier used with '%' prefix in schema references + # encoding_type: the token encoding type (from token_encoding_types) this will use + # java_type: the Java data type that will represent values of this type + # characters: pattern defining which characters are valid (using pattern_syntax defined above) + # description: human-readable description of this type + sub_token_base_types: + - name: "unsigned_integer" + symbol: "I" + encoding_type: "%I" + java_type: "int" + characters: "[0-9]" + description: "An unsigned integer" + - name: "signed_integer" + symbol: "J" + encoding_type: "%I" + java_type: "int" + characters: "[\\-+0-9]" + description: "A signed integer" + - name: "double" + symbol: "F" + encoding_type: "%F" + java_type: "double" + characters: "[0-9.\\-+eE]" + description: "A floating point number" + - name: "alphabetic" + symbol: "A" + encoding_type: "%A" + java_type: "String" + characters: "[a-zA-Z]" + description: "An alphabetic string" + - name: "alphanumeric" + symbol: "N" + encoding_type: "%A" + java_type: "String" + characters: "[a-zA-Z0-9]" + description: "An alphanumeric string" + - name: "hexadecimal" + symbol: "X" + encoding_type: "%H" + java_type: "String" + characters: "[0-9a-fA-F]" + description: "A hexadecimal string" + + # SubToken types build upon base types by adding specific constraints. + # Structure for each entry: + # name: identifier for this subToken type (used with '$' prefix when referenced) + # base_type: reference to a subToken base type (using '%' prefix) + # constraint: rules that determine if a string qualifies as this subToken type + # Constraints can use operators from the operators section and can be combined: + # - Using "&&" for logical AND + # - Using "||" for logical OR + # - Using "!" for logical NOT + # - NOTE: When using negative numbers, wrap them in parentheses, e.g. 
(-1) + # - NOTE: Chained constraints don't support grouping with parentheses + # description: human-readable description + # timestamp_component_type: [OPTIONAL] specify if this subtoken represents a timestamp component + # Required only if this subtoken will be used in timestamp formats + sub_token_types: + - name: "octet" + base_type: "%I" + constraint: "0-255" + description: "An integer between 0 and 255, inclusive" + - name: "YYYY" + base_type: "%I" + constraint: "{4}" + description: "A year in 4-digit format" + timestamp_component_type: "Y" + - name: "MM" + base_type: "%I" + constraint: "1-12" + description: "A numeric representation of a month between 1 and 12, inclusive" + timestamp_component_type: "M" + - name: "DD" + base_type: "%I" + constraint: "1-31" + description: "Day of the month between 1 and 31, inclusive" + timestamp_component_type: "D" + - name: "hh" + base_type: "%I" + constraint: "0-23" + description: "Hour of the day, either in 24-hour or 12-hour format" + timestamp_component_type: "h" + - name: "AP" + base_type: "%A" + constraint: "AM=1|PM=2" + description: "AM/PM indicator, where AM is assigned to the value '1' and PM to the value '2'" + timestamp_component_type: "AP" + - name: "mm" + base_type: "%I" + constraint: "0-59" + description: "Minute of the hour between 0 and 59, inclusive" + timestamp_component_type: "m" + - name: "ss" + base_type: "%I" + constraint: "0-59" + description: "Second of the minute between 0 and 59, inclusive" + timestamp_component_type: "s" + - name: "ms" + base_type: "%I" + constraint: "{3}" + description: "Millisecond of the second between 0 and 999, inclusive" + timestamp_component_type: "ms" + - name: "us" + base_type: "%I" + constraint: "{6}" + description: "Microsecond of the second between 0 and 999999, inclusive" + timestamp_component_type: "us" + - name: "Mon" + base_type: "%A" + constraint: "Jan=1|Feb=2|Mar=3|Apr=4|May=5|Jun=6|Jul=7|Aug=8|Sep=9|Oct=10|Nov=11|Dec=12" + description: "3-letter abbreviation for the month of the year" + timestamp_component_type: "M" + - name: "Day" + base_type: "%A" + constraint: "Sun|Mon|Tue|Wed|Thu|Fri|Sat" + description: "3-letter abbreviation for the day of the week" + timestamp_component_type: "NA" + - name: "TZA" + base_type: "%A" + constraint: >- + UTC=0000|GMT=0000|EST=-0500|CST=-0600|MST=-0700|PST=-0800|EDT=-0400|CDT=-0500|MDT=-0600|PDT=-0700| + IST=0530|NPT=0545|JST=0900|AEST=1000|AEDT=1100|BST=0100|CET=0100|EET=0200|WET=0000|EEST=0300| + CEST=0200|AST=-0400|ADT=-0300|AKST=-0900|AKDT=-0800|HST=-1000|NST=-0330|NDT=-0230|NZST=1200| + NZDT=1300|SGT=0800|HKT=0800|KST=0900|WAT=0100|CAT=0200|EAT=0300|MSK=0300|PKT=0500|AFT=0430| + IRST=0330|IRDT=0430|ACST=0930|ACDT=1030|AWST=0800|AWDT=0900|ICT=0700|AZT=0400|AZST=0400|SAST=0200|Z=0000 + description: "Time zone abbreviation" + timestamp_component_type: "TZhm" + - name: "TZOhhmm" + base_type: "%J" + constraint: ">=(-1800) && <=1800" + description: "Time zone offset from UTC in hours and minutes conjugated as a signed integer (e.g. -0500, +0200)" + timestamp_component_type: "TZhm" + - name: "TZOhh" + base_type: "%J" + constraint: ">=(-14) && <=14" + description: "Time zone offset from UTC in two-digit hours (e.g. -05, +02)" + timestamp_component_type: "TZh" + - name: "TZOmm" + base_type: "%I" + constraint: ">=0 && <=59" + description: "Time zone offset from UTC in two-digit minutes (e.g. 
30, 00)" + timestamp_component_type: "TZm" + - name: "port" + base_type: "%I" + constraint: "<=65535" + description: "A port number, which is an unsigned integer between 0 and 65535, inclusive" + + + # Token types define how to recognize and encode complete tokens (space/tab delimited elements). + # Structure for each entry: + # name: identifier for this token type (used with '$' prefix when referenced) + # encoding_type: reference to a token encoding type (using '%' prefix) + # special_sub_token_delimiters: [OPTIONAL] additional delimiters for this specific token type + # format: template for this token type, which can contain: + # - References to defined subToken types (using '$' prefix) + # - Ad-hoc subTokens defined inline using "(%BaseType constraint)" syntax + # - Delimiter characters that must match exactly + # Formats should be as specific as possible for efficient parsing + # description: human-readable description of this token type + token_types: + - name: "IPv4" + encoding_type: "%4" + format: "$octet.$octet.$octet.$octet" + description: "IPv4 address in dotted-decimal notation" + - name: "IPv4_with_port" + encoding_type: "%V" + format: "$octet.$octet.$octet.$octet:$port" + description: "IPv4 address in dotted-decimal notation with port" + - name: "UUID_standard" + encoding_type: "%U" + format: "(%X{8})-(%X{4})-(%X{4})-(%X{4})-(%X{12})" + description: "UUID in standard format, e.g. 123e4567-e89b-12d3-a456-426614174000" + - name: "UUID_compact" + encoding_type: "%U" + format: "(%X{32})" + description: "UUID in compact format, e.g. 123e4567e89b12d3a456426614174000" + - name: "DD" + encoding_type: "%I" + format: "$DD" + description: "Day of the month as an integer" + - name: "Day" + encoding_type: "%A" + format: "$Day" + description: "3-letter abbreviation for the day of the week" + - name: "Mon" + encoding_type: "%A" + format: "$Mon" + description: "3-letter abbreviation for the month of the year" + - name: "YYYY" + encoding_type: "%I" + format: "$YYYY" + description: "Year in 4-digit format" + - name: "AP" + encoding_type: "%A" + format: "$AP" + description: "AM/PM indicator, where AM is assigned to the value '1' and PM to the value '2'" + - name: "TZA" + encoding_type: "%A" + format: "$TZA" + description: "Time zone abbreviation, e.g. EST, GMT, UTC, etc." + - name: "TZOhhmm" + encoding_type: "%I" + format: "$TZOhhmm" + description: "Time zone offset from UTC in hours and minutes conjugated as a signed integer (e.g. 
-0500, +0200)" + - name: "date1" + encoding_type: "%A" + format: "$DD/$MM/$YYYY" + description: "Date without time, day first" + - name: "date2" + encoding_type: "%A" + format: "$YYYY-$MM-$DD" + description: "Date without time, year first" + - name: "timeS" + encoding_type: "%A" + format: "$hh:$mm:$ss" + description: "Time with second resolution" + - name: "timeMS" + encoding_type: "%A" + format: "$hh:$mm:$ss.$ms" + description: "Time with millisecond resolution" + - name: "timeUS" + encoding_type: "%A" + format: "$hh:$mm:$ss.$us" + description: "Time with microsecond resolution" + - name: "logging-libraries-datetime" + encoding_type: "%T" + # todo - special_sub_token_delimiters not supported yet + special_sub_token_delimiters: "T" + format: "$DD/$Mon/$YYYYT$hh:$mm:$ss" + description: "Date and time commonly used in logging library format" + - name: "apache-log-datetime" + encoding_type: "%T" + format: "$DD/$Mon/$YYYY:$hh:$mm:$ss" + description: "Date and time in Apache log format" + + # Multi-token types define patterns that span across multiple tokens (space/tab separated). + # These are used for complex patterns like timestamps that might include multiple tokens. + # Structure for each entry: + # name: identifier for this multi-token type + # encoding_type: reference to a token encoding type (using '%' prefix) + # format: template using token types and/or subToken types (all with '$' prefix) + # Each referenced element must be separated by spaces to indicate token boundaries + # description: human-readable description + multi_token_types: + - name: "RFC-1123-timestamp" + encoding_type: "%T" + # e.g Oct 05, 2023 02:48:00 PM + format: "$Mon, $DD $YYYY $timeS $AP" + description: "RFC 1123 formatted timestamp" + - name: "logging-libraries-datetime-timestamp" + encoding_type: "%T" + # e.g 05/Oct/2023T14:48:00 +0200 + format: "$logging-libraries-datetime $TZOhhmm" + description: "Logging libraries formatted timestamp" + - name: "apache-log-timestamp" + encoding_type: "%T" + # e.g 05/Oct/2023:14:48:00 +0200 + format: "$apache-log-datetime $TZOhhmm" + description: "Apache log formatted timestamp" + - name: "apache-error-log-timestamp" + encoding_type: "%T" + # e.g Thu Oct 05 14:48:00 2023 + format: "$Day $Mon $DD $timeMS $YYYY" + description: "Apache error log formatted timestamp" diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/HexadecimalArgumentTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/HexadecimalArgumentTests.java new file mode 100644 index 0000000000000..be4380f9ccfc1 --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/HexadecimalArgumentTests.java @@ -0,0 +1,43 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import org.elasticsearch.test.ESTestCase; + +public class HexadecimalArgumentTests extends ESTestCase { + + public void testEvenLengthHexString() { + HexadecimalArgument arg = new HexadecimalArgument("A1B2C3D4", 0, 8); + byte[] expected = { (byte) 0xA1, (byte) 0xB2, (byte) 0xC3, (byte) 0xD4 }; + assertArrayEquals(expected, arg.value()); + } + + public void testOddLengthHexString() { + HexadecimalArgument arg = new HexadecimalArgument("A1B2C3D4", 1, 7); + byte[] expected = { (byte) 0x1B, (byte) 0x2C, (byte) 0x3D, (byte) 0x04 }; + assertArrayEquals(expected, arg.value()); + } + + public void testSingleDigitHexString() { + HexadecimalArgument arg = new HexadecimalArgument("A1B2C3D4", 4, 1); + byte[] expected = { (byte) 0x0C }; + assertArrayEquals(expected, arg.value()); + } + + public void testEmptyHexString() { + HexadecimalArgument arg = new HexadecimalArgument("", 0, 0); + byte[] expected = {}; + assertArrayEquals(expected, arg.value()); + } + + public void testEncodeMethod() { + HexadecimalArgument arg = new HexadecimalArgument("A1B2C3D4", 0, 8); + String expected = "obLD1A"; // Base64 encoding of {0xA1, 0xB2, 0xC3, 0xD4} + assertEquals(expected, arg.encode()); + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/ParserTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/ParserTests.java new file mode 100644 index 0000000000000..9c43f857a733c --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/api/ParserTests.java @@ -0,0 +1,142 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.api; + +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.TimestampFormat; + +import java.time.format.DateTimeFormatter; +import java.util.List; +import java.util.Locale; + +import static org.hamcrest.Matchers.instanceOf; + +public class ParserTests extends ESTestCase { + + private static Parser parser; + private static StringBuilder patternedMessage; + + @Override + public void setUp() throws Exception { + super.setUp(); + parser = ParserFactory.createParser(); + patternedMessage = new StringBuilder(); + } + + public void testSimpleIpAndNumber() throws ParseException { + String messageWithIpAndNumber = "Response from 127.0.0.1 took 2000 ms"; + List> parsedArguments = parser.parse(messageWithIpAndNumber); + Parser.constructPattern(messageWithIpAndNumber, parsedArguments, patternedMessage, true); + assertEquals("Response from %4 took %I ms", patternedMessage.toString()); + assertEquals(2, parsedArguments.size()); + assertEquals("IPV4", parsedArguments.getFirst().type().name()); + Argument argument = parsedArguments.get(1); + assertThat(argument, instanceOf(IntegerArgument.class)); + assertEquals(2000, ((IntegerArgument) argument).value().intValue()); + assertNull("Sign should be null", ((IntegerArgument) argument).sign()); + } + + public void testRFC1123TimestampAndIpAndNumber() throws ParseException { + String messageWithTimestampIpAndNumber = "Oct, 05 2023 02:48:07 PM INFO Response from 146.10.10.133 took 2000 ms"; + List> parsedArguments = parser.parse(messageWithTimestampIpAndNumber); + Parser.constructPattern(messageWithTimestampIpAndNumber, parsedArguments, patternedMessage, true); + assertEquals("%T INFO Response from %4 took %I ms", patternedMessage.toString()); + assertEquals(3, parsedArguments.size()); + assertThat(parsedArguments.getFirst(), instanceOf(Timestamp.class)); + Timestamp timestamp = (Timestamp) parsedArguments.getFirst(); + assertEquals(1696517287000L, timestamp.getTimestampMillis()); + String pattern = timestamp.getFormat(); + assertEquals("MMM, dd yyyy hh:mm:ss a", pattern); + DateTimeFormatter dateTimeFormatter = DateTimeFormatter.ofPattern(pattern, Locale.US); + assertEquals(1696517287000L, TimestampFormat.parseTimestamp(dateTimeFormatter, "Oct, 05 2023 02:48:07 PM")); + assertEquals("IPV4", parsedArguments.get(1).type().name()); + assertEquals("INTEGER", parsedArguments.get(2).type().name()); + } + + public void testInvalidTimestamp() throws ParseException { + String message = "Oct 05 2023 02:48:07 PM INFO Response from 146.10.10.133 took 2000 ms"; + List> parsedArguments = parser.parse(message); + Parser.constructPattern(message, parsedArguments, patternedMessage, true); + // todo - add support for local time - based on java.time.LocalTime + assertEquals("Oct %I %I %I:%I:%I PM INFO Response from %4 took %I ms", patternedMessage.toString()); + } + + public void testNumberArgumentsWithSign() throws ParseException { + String message = "-5 is negative, this:+10:-8 is both and this is positive: +20"; + List> parsedArguments = parser.parse(message); + Parser.constructPattern(message, parsedArguments, patternedMessage, true); + assertEquals("%I is negative, this:%I:%I is both and this is positive: %I", patternedMessage.toString()); + assertEquals(4, parsedArguments.size()); + Argument argument = parsedArguments.getFirst(); + assertThat(argument, instanceOf(IntegerArgument.class)); + assertEquals(Sign.MINUS, ((IntegerArgument) 
argument).sign()); + assertEquals(-5, ((IntegerArgument) argument).value().intValue()); + argument = parsedArguments.get(1); + assertThat(argument, instanceOf(IntegerArgument.class)); + assertEquals(Sign.PLUS, ((IntegerArgument) argument).sign()); + assertEquals(10, ((IntegerArgument) argument).value().intValue()); + argument = parsedArguments.get(2); + assertThat(argument, instanceOf(IntegerArgument.class)); + assertEquals(Sign.MINUS, ((IntegerArgument) argument).sign()); + assertEquals(-8, ((IntegerArgument) argument).value().intValue()); + argument = parsedArguments.get(3); + assertThat(argument, instanceOf(IntegerArgument.class)); + assertEquals(Sign.PLUS, ((IntegerArgument) argument).sign()); + assertEquals(20, ((IntegerArgument) argument).value().intValue()); + } + + public void testFloatingPointArguments() throws ParseException { + String message = "-5.08 is at the beginning, and here is one at the end: -1.09e-2"; + List> parsedArguments = parser.parse(message); + Parser.constructPattern(message, parsedArguments, patternedMessage, true); + assertEquals("%F is at the beginning, and here is one at the end: %F", patternedMessage.toString()); + assertEquals(2, parsedArguments.size()); + Argument argument = parsedArguments.getFirst(); + assertThat(argument, instanceOf(DoubleArgument.class)); + assertEquals(-5.08, ((DoubleArgument) argument).value(), 0); + argument = parsedArguments.get(1); + assertThat(argument, instanceOf(DoubleArgument.class)); + assertEquals(-0.0109, ((DoubleArgument) argument).value(), 0); + } + + public void testBigIntegerArgument() throws ParseException { + String message = "The value is 123456789 in the message"; + List> parsedArguments = parser.parse(message); + Parser.constructPattern(message, parsedArguments, patternedMessage, true); + assertEquals("The value is %I in the message", patternedMessage.toString()); + assertEquals(1, parsedArguments.size()); + Argument argument = parsedArguments.getFirst(); + assertThat(argument, instanceOf(IntegerArgument.class)); + assertEquals(123456789, ((IntegerArgument) argument).value().intValue()); + } + + public void testApacheLogTimestamp() throws ParseException { + String message = "05/Oct/2023:14:48:00 +0000 GET /index.html 200"; + List> parsedArguments = parser.parse(message); + Parser.constructPattern(message, parsedArguments, patternedMessage, true); + assertEquals("%T GET /index.html %I", patternedMessage.toString()); + assertEquals(2, parsedArguments.size()); + Argument argument = parsedArguments.getFirst(); + assertThat(argument, instanceOf(Timestamp.class)); + Timestamp timestamp = (Timestamp) argument; + assertEquals(1696517280000L, timestamp.getTimestampMillis()); + String pattern = timestamp.getFormat(); + assertEquals("dd/MMM/yyyy:HH:mm:ss Z", pattern); + DateTimeFormatter dateTimeFormatter = DateTimeFormatter.ofPattern(pattern, Locale.US); + assertEquals(1696517280000L, TimestampFormat.parseTimestamp(dateTimeFormatter, "05/Oct/2023:14:48:00 +0000")); + argument = parsedArguments.get(1); + assertThat(argument, instanceOf(IntegerArgument.class)); + assertEquals(200, ((IntegerArgument) argument).value().intValue()); + } + + public void testApacheErrorLogTimestamp() throws ParseException { + String message = "[Thu Oct 05 14:48:00.123 2023] [info] [pid 9] core.c(4739): [client 172.17.0.1:50764] AH00128: File does not " + + "exist: /usr/local/apache2/htdocs/favicon.ico."; + // todo - timestamp with NA component (day of week) not yet supported as well as IP4V address + } +} diff --git 
a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SchemaCompilerTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SchemaCompilerTests.java new file mode 100644 index 0000000000000..20fdb2eea7ec0 --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SchemaCompilerTests.java @@ -0,0 +1,453 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.compiler; + +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.TimestampComponentType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.BitmaskRegistry; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.CharSpecificParsingInfo; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.MultiTokenType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubTokenType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubstringToIntegerMap; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubstringView; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.TimestampFormat; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.TokenType; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.MultiTokenFormat; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.PatternUtils; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.Schema; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.ToIntFunction; + +public class SchemaCompilerTests extends ESTestCase { + + public void testMergeIntRangeBitmasks_OverlappingRanges() { + ArrayList input = new ArrayList<>(); + input.add(SchemaCompiler.IntRangeBitmask.of(10, 20, 0x01)); + input.add(SchemaCompiler.IntRangeBitmask.of(15, 20, 0x02)); + input.add(SchemaCompiler.IntRangeBitmask.of(20, 30, 0x04)); + input.add(SchemaCompiler.IntRangeBitmask.of(25, 30, 0x08)); + + ArrayList expected = new ArrayList<>(); + expected.add(SchemaCompiler.IntRangeBitmask.of(Integer.MIN_VALUE, 9, 0x00)); + expected.add(SchemaCompiler.IntRangeBitmask.of(10, 14, 0x01)); + expected.add(SchemaCompiler.IntRangeBitmask.of(15, 19, 0x03)); + expected.add(SchemaCompiler.IntRangeBitmask.of(20, 20, 0x07)); + expected.add(SchemaCompiler.IntRangeBitmask.of(21, 24, 0x04)); + expected.add(SchemaCompiler.IntRangeBitmask.of(25, 30, 0x0C)); + expected.add(SchemaCompiler.IntRangeBitmask.of(31, Integer.MAX_VALUE, 0x00)); + + assertEquals(expected, SchemaCompiler.mergeIntRangeBitmasks(input)); + } + + public void testMergeIntRangeBitmasks_GapsBetweenRanges() { + ArrayList input = new ArrayList<>(); + input.add(SchemaCompiler.IntRangeBitmask.of(10, 15, 0x01)); + input.add(SchemaCompiler.IntRangeBitmask.of(20, 25, 0x02)); + + ArrayList expected = new ArrayList<>(); + expected.add(SchemaCompiler.IntRangeBitmask.of(Integer.MIN_VALUE, 9, 0x00)); + expected.add(SchemaCompiler.IntRangeBitmask.of(10, 15, 0x01)); + 
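// the merge should fill the uncovered gap (16-19) and the open-ended extremes with an empty bitmask +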
expected.add(SchemaCompiler.IntRangeBitmask.of(16, 19, 0x00)); + expected.add(SchemaCompiler.IntRangeBitmask.of(20, 25, 0x02)); + expected.add(SchemaCompiler.IntRangeBitmask.of(26, Integer.MAX_VALUE, 0x00)); + + assertEquals(expected, SchemaCompiler.mergeIntRangeBitmasks(input)); + } + + public void testMergeIntRangeBitmasks_SingleIntegerRanges() { + ArrayList input = new ArrayList<>(); + input.add(SchemaCompiler.IntRangeBitmask.of(10, 15, 0x01)); + input.add(SchemaCompiler.IntRangeBitmask.of(15, 15, 0x02)); + input.add(SchemaCompiler.IntRangeBitmask.of(0, 30, 0x04)); + + ArrayList expected = new ArrayList<>(); + expected.add(SchemaCompiler.IntRangeBitmask.of(Integer.MIN_VALUE, -1, 0x00)); + expected.add(SchemaCompiler.IntRangeBitmask.of(0, 9, 0x04)); + expected.add(SchemaCompiler.IntRangeBitmask.of(10, 14, 0x05)); + expected.add(SchemaCompiler.IntRangeBitmask.of(15, 15, 0x07)); + expected.add(SchemaCompiler.IntRangeBitmask.of(16, 30, 0x04)); + expected.add(SchemaCompiler.IntRangeBitmask.of(31, Integer.MAX_VALUE, 0x00)); + + assertEquals(expected, SchemaCompiler.mergeIntRangeBitmasks(input)); + } + + public void testMergeIntRangeBitmasks_IntegerMinMax() { + ArrayList input = new ArrayList<>(); + input.add(SchemaCompiler.IntRangeBitmask.of(Integer.MIN_VALUE, 10, 0x01)); + input.add(SchemaCompiler.IntRangeBitmask.of(0, Integer.MAX_VALUE, 0x02)); + + ArrayList expected = new ArrayList<>(); + expected.add(SchemaCompiler.IntRangeBitmask.of(Integer.MIN_VALUE, -1, 0x01)); + expected.add(SchemaCompiler.IntRangeBitmask.of(0, 10, 0x03)); + expected.add(SchemaCompiler.IntRangeBitmask.of(11, Integer.MAX_VALUE, 0x02)); + + assertEquals(expected, SchemaCompiler.mergeIntRangeBitmasks(input)); + } + + public void testMergeIntRangeBitmasks_ComplexCase() { + ArrayList input = new ArrayList<>(); + input.add(SchemaCompiler.IntRangeBitmask.of(10, 20, 0x01)); + input.add(SchemaCompiler.IntRangeBitmask.of(15, 25, 0x02)); + input.add(SchemaCompiler.IntRangeBitmask.of(20, 30, 0x04)); + input.add(SchemaCompiler.IntRangeBitmask.of(25, 35, 0x08)); + input.add(SchemaCompiler.IntRangeBitmask.of(22, 22, 0x10)); + input.add(SchemaCompiler.IntRangeBitmask.of(Integer.MIN_VALUE, 100, 0x20)); + input.add(SchemaCompiler.IntRangeBitmask.of(-50, -30, 0x40)); + input.add(SchemaCompiler.IntRangeBitmask.of(500, 2000, 0x80)); + + ArrayList expected = new ArrayList<>(); + expected.add(SchemaCompiler.IntRangeBitmask.of(Integer.MIN_VALUE, -51, 0x20)); + expected.add(SchemaCompiler.IntRangeBitmask.of(-50, -30, 0x60)); + expected.add(SchemaCompiler.IntRangeBitmask.of(-29, 9, 0x20)); + expected.add(SchemaCompiler.IntRangeBitmask.of(10, 14, 0x21)); + expected.add(SchemaCompiler.IntRangeBitmask.of(15, 19, 0x23)); + expected.add(SchemaCompiler.IntRangeBitmask.of(20, 20, 0x27)); + expected.add(SchemaCompiler.IntRangeBitmask.of(21, 21, 0x26)); + expected.add(SchemaCompiler.IntRangeBitmask.of(22, 22, 0x36)); + expected.add(SchemaCompiler.IntRangeBitmask.of(23, 24, 0x26)); + expected.add(SchemaCompiler.IntRangeBitmask.of(25, 25, 0x2E)); + expected.add(SchemaCompiler.IntRangeBitmask.of(26, 30, 0x2C)); + expected.add(SchemaCompiler.IntRangeBitmask.of(31, 35, 0x28)); + expected.add(SchemaCompiler.IntRangeBitmask.of(36, 100, 0x20)); + expected.add(SchemaCompiler.IntRangeBitmask.of(101, 499, 0x00)); + expected.add(SchemaCompiler.IntRangeBitmask.of(500, 2000, 0x80)); + expected.add(SchemaCompiler.IntRangeBitmask.of(2001, Integer.MAX_VALUE, 0x00)); + + assertExpectedIntRangeBitmaskLists(expected, SchemaCompiler.mergeIntRangeBitmasks(input)); + } + + 
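/** Compares the two bitmask-range lists element by element so a mismatch is reported with its index instead of as one large list diff. */ +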
private static void assertExpectedIntRangeBitmaskLists( + ArrayList expected, + ArrayList actual + ) { + if (expected.equals(actual)) { + return; + } + for (int i = 0; i < expected.size(); i++) { + SchemaCompiler.IntRangeBitmask expectedRange = expected.get(i); + if (i < actual.size()) { + SchemaCompiler.IntRangeBitmask actualRange = actual.get(i); + assertEquals("Expected " + expectedRange + " at index " + i + ", but found " + actualRange, expectedRange, actualRange); + } else { + fail("Expected to find " + expectedRange + " at index " + i + ", but actual list is shorter."); + } + } + } + + public void testSchemaCompilation() { + Schema schema = Schema.getInstance(); + CompiledSchema compiledSchema = SchemaCompiler.compile(schema); + + assertNotNull(compiledSchema); + + BitmaskRegistry subTokenBitmaskRegistry = compiledSchema.subTokenBitmaskRegistry; + + subTokenBitmaskRegistry.getAllRegisteredTypes().forEach(subTokenType -> { + try { + subTokenType.getHigherLevelBitmaskByPosition(compiledSchema.maxSubTokensPerToken - 1); + } catch (Exception e) { + fail("Should not have thrown exception: " + e.getMessage()); + } + }); + + int int_bitmask = subTokenBitmaskRegistry.getBitmask("integer"); + int double_bitmask = subTokenBitmaskRegistry.getBitmask("double"); + int hex_bitmask = subTokenBitmaskRegistry.getBitmask("hex"); + int MM_bitmask = subTokenBitmaskRegistry.getBitmask("MM"); + int DD_bitmask = subTokenBitmaskRegistry.getBitmask("DD"); + int YYYY_bitmask = subTokenBitmaskRegistry.getBitmask("YYYY"); + int hh_bitmask = subTokenBitmaskRegistry.getBitmask("hh"); + int mm_bitmask = subTokenBitmaskRegistry.getBitmask("mm"); + int ss_bitmask = subTokenBitmaskRegistry.getBitmask("ss"); + int TZOhhmm_bitmask = subTokenBitmaskRegistry.getBitmask("TZOhhmm"); + int Mon_subToken_bitmask = subTokenBitmaskRegistry.getBitmask("Mon"); + int Day_bitmask = subTokenBitmaskRegistry.getBitmask("Day"); + int octet_bitmask = subTokenBitmaskRegistry.getBitmask("octet"); + + int allIntegerSubTokenBitmask = compiledSchema.allIntegerSubTokenBitmask; + assertNotEquals(0x00, allIntegerSubTokenBitmask & MM_bitmask); + assertNotEquals(0x00, allIntegerSubTokenBitmask & DD_bitmask); + assertNotEquals(0x00, allIntegerSubTokenBitmask & YYYY_bitmask); + assertNotEquals(0x00, allIntegerSubTokenBitmask & hh_bitmask); + assertNotEquals(0x00, allIntegerSubTokenBitmask & mm_bitmask); + assertNotEquals(0x00, allIntegerSubTokenBitmask & ss_bitmask); + assertNotEquals(0x00, allIntegerSubTokenBitmask & TZOhhmm_bitmask); + assertEquals(0x00, allIntegerSubTokenBitmask & Mon_subToken_bitmask); + assertEquals(0x00, allIntegerSubTokenBitmask & Day_bitmask); + assertNotEquals(0x00, allIntegerSubTokenBitmask & octet_bitmask); + + assertEquals(int_bitmask, compiledSchema.intSubTokenBitmask); + assertEquals(int_bitmask | double_bitmask | hex_bitmask, compiledSchema.genericSubTokenTypesBitmask); + + int[] charToSubTokenBitmask = compiledSchema.charToSubTokenBitmask; + int charSubTokenBitmask = charToSubTokenBitmask['M']; + assertEquals(0x00, charSubTokenBitmask & MM_bitmask); + assertNotEquals(0x00, charSubTokenBitmask & Mon_subToken_bitmask); + assertEquals(0x00, charSubTokenBitmask & DD_bitmask); + assertNotEquals(0x00, charSubTokenBitmask & Day_bitmask); + assertEquals(0x00, charSubTokenBitmask & int_bitmask); + assertEquals(0x00, charSubTokenBitmask & hex_bitmask); + + charSubTokenBitmask = charToSubTokenBitmask['3']; + assertNotEquals(0x00, charSubTokenBitmask & MM_bitmask); + assertEquals(0x00, charSubTokenBitmask & 
Mon_subToken_bitmask); + assertNotEquals(0x00, charSubTokenBitmask & DD_bitmask); + assertNotEquals(0x00, charSubTokenBitmask & int_bitmask); + assertNotEquals(0x00, charSubTokenBitmask & double_bitmask); + assertNotEquals(0x00, charSubTokenBitmask & hex_bitmask); + assertEquals(0x00, charSubTokenBitmask & Day_bitmask); + + byte[] charToCharType = compiledSchema.charToCharType; + assertEquals(CharCodes.ALPHABETIC_CHAR_CODE, charToCharType['M']); + assertEquals(CharCodes.DIGIT_CHAR_CODE, charToCharType['3']); + assertEquals(CharCodes.SUBTOKEN_DELIMITER_CHAR_CODE, charToCharType['/']); + assertEquals(CharCodes.SUBTOKEN_DELIMITER_CHAR_CODE, charToCharType[':']); + assertEquals(CharCodes.TOKEN_DELIMITER_CHAR_CODE, charToCharType[' ']); + + assertEquals(6, compiledSchema.maxSubTokensPerToken); + assertEquals(5, compiledSchema.maxTokensPerMultiToken); + + int[] smallIntegerSubTokenBitmasks = compiledSchema.smallIntegerSubTokenBitmasks; + int bitmask = smallIntegerSubTokenBitmasks[22]; + assertNotEquals("DD bitmask should be set for 22", 0x00, bitmask & DD_bitmask); + assertEquals("MM bitmask should not be set for 22", 0x00, bitmask & MM_bitmask); + assertEquals("YYYY bitmask should not be set for 22", 0x00, bitmask & YYYY_bitmask); + assertNotEquals("hh bitmask should be set for 22", 0x00, bitmask & hh_bitmask); + assertNotEquals("mm bitmask should be set for 22", 0x00, bitmask & mm_bitmask); + assertNotEquals("ss bitmask should be set for 22", 0x00, bitmask & ss_bitmask); + assertNotEquals("int_bitmask should be set for 22", 0x00, bitmask & int_bitmask); + + bitmask = smallIntegerSubTokenBitmasks[50]; + assertEquals("DD bitmask should not be set for 50", 0x00, bitmask & DD_bitmask); + assertEquals("DD bitmask should not be set for 50", 0x00, bitmask & MM_bitmask); + assertEquals("YYYY bitmask should not be set for 50", 0x00, bitmask & YYYY_bitmask); + assertEquals("hh bitmask should not be set for 50", 0x00, bitmask & hh_bitmask); + assertNotEquals("mm bitmask should be set for 50", 0x00, bitmask & mm_bitmask); + assertNotEquals("ss bitmask should be set for 50", 0x00, bitmask & ss_bitmask); + assertNotEquals("int_bitmask should be set for 50", 0x00, bitmask & int_bitmask); + + int bitmaskForInteger = getBitmaskForInteger(2000, compiledSchema); + assertNotEquals(0x00, bitmaskForInteger & YYYY_bitmask); + assertEquals(0x00, bitmaskForInteger & TZOhhmm_bitmask); + assertNotEquals("int_bitmask should be set for 2000", 0x00, bitmaskForInteger & int_bitmask); + + CharSpecificParsingInfo[] charSpecificParsingInfos = compiledSchema.charSpecificParsingInfos; + assertNotNull(charSpecificParsingInfos); + CharSpecificParsingInfo spaceDelimiterInfo = charSpecificParsingInfos[' ']; + + BitmaskRegistry tokenBitmaskRegistry = compiledSchema.tokenBitmaskRegistry; + + tokenBitmaskRegistry.getAllRegisteredTypes().forEach(tokenType -> { + try { + tokenType.getHigherLevelBitmaskByPosition(compiledSchema.maxTokensPerMultiToken - 1); + } catch (Exception e) { + fail("Should not have thrown exception: " + e.getMessage()); + } + }); + + int ipv4_bitmask = tokenBitmaskRegistry.getBitmask("IPv4"); + int uuid_bitmask = tokenBitmaskRegistry.getBitmask("UUID_standard"); + int Mon_token_bitmask = tokenBitmaskRegistry.getBitmask("Mon"); + + CharSpecificParsingInfo dotDelimiterInfo = charSpecificParsingInfos['.']; + int[] tokenBitmaskPerSubTokenIndex_dot = dotDelimiterInfo.tokenBitmaskPerDelimiterPosition; + CharSpecificParsingInfo dashDelimiterInfo = charSpecificParsingInfos['-']; + int[] tokenBitmaskPerSubTokenIndex_dash = 
dashDelimiterInfo.tokenBitmaskPerDelimiterPosition; + int[] tokenBitmaskPerSubTokenIndex_space = spaceDelimiterInfo.tokenBitmaskPerDelimiterPosition; + + // IPv4 + assertNotEquals(0x00, tokenBitmaskPerSubTokenIndex_dot[0] & ipv4_bitmask); + assertEquals(0x00, tokenBitmaskPerSubTokenIndex_space[0] & ipv4_bitmask); + assertNotEquals(0x00, tokenBitmaskPerSubTokenIndex_dot[1] & ipv4_bitmask); + assertEquals(0x00, tokenBitmaskPerSubTokenIndex_space[1] & ipv4_bitmask); + assertNotEquals(0x00, tokenBitmaskPerSubTokenIndex_dot[2] & ipv4_bitmask); + assertEquals(0x00, tokenBitmaskPerSubTokenIndex_space[2] & ipv4_bitmask); + // the fourth sub-token in IPv4 should be tested against the token (not sub-token) delimiter info because it is the last sub-token + assertEquals(0x00, tokenBitmaskPerSubTokenIndex_dot[3] & ipv4_bitmask); + assertNotEquals(0x00, tokenBitmaskPerSubTokenIndex_space[3] & ipv4_bitmask); + + // UUID + assertEquals(0x00, tokenBitmaskPerSubTokenIndex_dot[0] & uuid_bitmask); + assertNotEquals(0x00, tokenBitmaskPerSubTokenIndex_dash[0] & uuid_bitmask); + assertEquals(0x00, tokenBitmaskPerSubTokenIndex_dot[1] & uuid_bitmask); + assertNotEquals(0x00, tokenBitmaskPerSubTokenIndex_dash[1] & uuid_bitmask); + assertEquals(0x00, tokenBitmaskPerSubTokenIndex_dot[2] & uuid_bitmask); + assertNotEquals(0x00, tokenBitmaskPerSubTokenIndex_dash[2] & uuid_bitmask); + + // UUID standard format is defined in the schema as: "(%X{8})-(%X{4})-(%X{4})-(%X{4})-(%X{12})" + int tokenBitmaskForUUID = tokenBitmaskRegistry.getCombinedBitmask(); + String testUuid = "123e4567-e89b-12d3-a456-426614174000"; + ToIntFunction[] dashSubTokenBitmaskGeneratorPerIndex = dashDelimiterInfo.bitmaskGeneratorPerPosition; + assertNotNull(dashSubTokenBitmaskGeneratorPerIndex); + ToIntFunction subTokenBitmaskGenerator = dashSubTokenBitmaskGeneratorPerIndex[0]; + assertNotNull(subTokenBitmaskGenerator); + int firstSubTokenBitmask = subTokenBitmaskRegistry.getBitmask( + org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.SubTokenType.ADHOC_PREFIX + "%X{8}" + ); + assertNotEquals(0x00, firstSubTokenBitmask); + assertEquals(0, subTokenBitmaskGenerator.applyAsInt(new SubstringView(testUuid, 0, 7))); + SubstringView testedSubstring = new SubstringView(testUuid, 0, 8); + assertTrue(subTokenBitmaskGenerator.applyAsInt(testedSubstring) > 0); + assertNotEquals(0x00, subTokenBitmaskGenerator.applyAsInt(testedSubstring) & firstSubTokenBitmask); + tokenBitmaskForUUID &= subTokenBitmaskRegistry.getHigherLevelBitmaskByPosition(firstSubTokenBitmask, 0); + subTokenBitmaskGenerator = dashSubTokenBitmaskGeneratorPerIndex[1]; + assertNotNull(subTokenBitmaskGenerator); + int middleSubTokensBitmask = subTokenBitmaskRegistry.getBitmask( + org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.SubTokenType.ADHOC_PREFIX + "%X{4}" + ); + assertEquals(0, subTokenBitmaskGenerator.applyAsInt(new SubstringView(testUuid, 9, 12))); + testedSubstring = new SubstringView(testUuid, 9, 13); + assertTrue(subTokenBitmaskGenerator.applyAsInt(testedSubstring) > 0); + assertNotEquals(0x00, subTokenBitmaskGenerator.applyAsInt(testedSubstring) & middleSubTokensBitmask); + tokenBitmaskForUUID &= subTokenBitmaskRegistry.getHigherLevelBitmaskByPosition(middleSubTokensBitmask, 1); + subTokenBitmaskGenerator = dashSubTokenBitmaskGeneratorPerIndex[2]; + assertNotNull(subTokenBitmaskGenerator); + assertEquals(0, subTokenBitmaskGenerator.applyAsInt(new SubstringView(testUuid, 14, 17))); + testedSubstring = new SubstringView(testUuid, 14, 18); + 
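// indices 14-18 cover the full third group "12d3" of the test UUID, so the %X{4} generator should now yield a non-zero bitmask +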
assertTrue(subTokenBitmaskGenerator.applyAsInt(testedSubstring) > 0); + assertNotEquals(0x00, subTokenBitmaskGenerator.applyAsInt(testedSubstring) & middleSubTokensBitmask); + tokenBitmaskForUUID &= subTokenBitmaskRegistry.getHigherLevelBitmaskByPosition(middleSubTokensBitmask, 2); + subTokenBitmaskGenerator = dashSubTokenBitmaskGeneratorPerIndex[3]; + assertNotNull(subTokenBitmaskGenerator); + assertEquals(0, subTokenBitmaskGenerator.applyAsInt(new SubstringView(testUuid, 19, 22))); + testedSubstring = new SubstringView(testUuid, 19, 23); + assertTrue(subTokenBitmaskGenerator.applyAsInt(testedSubstring) > 0); + assertNotEquals(0x00, subTokenBitmaskGenerator.applyAsInt(testedSubstring) & middleSubTokensBitmask); + tokenBitmaskForUUID &= subTokenBitmaskRegistry.getHigherLevelBitmaskByPosition(middleSubTokensBitmask, 3); + // the fifth sub-token of a UUID should be tested against the token (not sub-token) delimiter info because it is the last sub-token + int lastSubTokenBitmask = subTokenBitmaskRegistry.getBitmask( + org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.SubTokenType.ADHOC_PREFIX + "%X{12}" + ); + subTokenBitmaskGenerator = dashSubTokenBitmaskGeneratorPerIndex[4]; + assertNull(subTokenBitmaskGenerator); + subTokenBitmaskGenerator = spaceDelimiterInfo.bitmaskGeneratorPerPosition[4]; + assertNotNull(subTokenBitmaskGenerator); + assertEquals(0, subTokenBitmaskGenerator.applyAsInt(new SubstringView(testUuid, 24, 35))); + testedSubstring = new SubstringView(testUuid, 24, 36); + assertTrue(subTokenBitmaskGenerator.applyAsInt(testedSubstring) > 0); + assertNotEquals(0x00, subTokenBitmaskGenerator.applyAsInt(testedSubstring) & lastSubTokenBitmask); + tokenBitmaskForUUID &= subTokenBitmaskRegistry.getHigherLevelBitmaskByPosition(lastSubTokenBitmask, 4); + tokenBitmaskForUUID &= compiledSchema.subTokenCountToTokenBitmask[4]; + assertEquals("The combined UUID token bitmask should match the expected UUID bitmask", uuid_bitmask, tokenBitmaskForUUID); + + SubstringToIntegerMap subTokenNumericValueRepresentation = compiledSchema.subTokenNumericValueRepresentation; + assertEquals(3, subTokenNumericValueRepresentation.applyAsInt(new SubstringView("Mar"))); + assertEquals(0, subTokenNumericValueRepresentation.applyAsInt(new SubstringView("Mac"))); + assertEquals(200, subTokenNumericValueRepresentation.applyAsInt(new SubstringView("CEST"))); + + // Mon + assertNotEquals(0x00, tokenBitmaskPerSubTokenIndex_space[0] & Mon_token_bitmask); + assertEquals(0x00, tokenBitmaskPerSubTokenIndex_dot[0] & Mon_token_bitmask); + assertEquals(0x00, tokenBitmaskPerSubTokenIndex_dash[0] & Mon_token_bitmask); + assertEquals(0x00, tokenBitmaskPerSubTokenIndex_space[1] & Mon_token_bitmask); + assertEquals(0x00, tokenBitmaskPerSubTokenIndex_dot[1] & Mon_token_bitmask); + assertEquals(0x00, tokenBitmaskPerSubTokenIndex_dash[1] & Mon_token_bitmask); + + int[] subTokenCountToTokenBitmask = compiledSchema.subTokenCountToTokenBitmask; + assertEquals(0x00, subTokenCountToTokenBitmask[2] & ipv4_bitmask); + assertNotEquals(0x00, subTokenCountToTokenBitmask[3] & ipv4_bitmask); + assertEquals(0x00, subTokenCountToTokenBitmask[4] & ipv4_bitmask); + assertEquals(0x00, subTokenCountToTokenBitmask[3] & uuid_bitmask); + assertNotEquals(0x00, subTokenCountToTokenBitmask[4] & uuid_bitmask); + assertEquals(0x00, subTokenCountToTokenBitmask[5] & uuid_bitmask); + + assertEquals(8, compiledSchema.maxSubTokensPerMultiToken); + BitmaskRegistry multiTokenBitmaskRegistry = compiledSchema.multiTokenBitmaskRegistry; + int 
rfc_1123_timestamp_bitmask = multiTokenBitmaskRegistry.getBitmask("RFC-1123-timestamp"); + MultiTokenType timestamp1Type = multiTokenBitmaskRegistry.getHighestPriorityType(rfc_1123_timestamp_bitmask); + assertEquals(7, timestamp1Type.getNumSubTokens()); + TimestampFormat rfc1123TimestampFormat = timestamp1Type.getTimestampFormat(); + assertEquals(7, rfc1123TimestampFormat.getNumTimestampComponents()); + int[] timestampComponentsOrder = rfc1123TimestampFormat.getTimestampComponentsOrder(); + assertEquals(TimestampComponentType.values().length, timestampComponentsOrder.length); + // RFC-1123-timestamp type format is: "$Mon, $DD $YYYY $timeS $AP" + assertEquals("MMM, dd yyyy hh:mm:ss a", rfc1123TimestampFormat.getJavaTimeFormat()); + assertEquals(0, timestampComponentsOrder[TimestampComponentType.MONTH_CODE]); + assertEquals(1, timestampComponentsOrder[TimestampComponentType.DAY_CODE]); + assertEquals(2, timestampComponentsOrder[TimestampComponentType.YEAR_CODE]); + assertEquals(3, timestampComponentsOrder[TimestampComponentType.HOUR_CODE]); + assertEquals(4, timestampComponentsOrder[TimestampComponentType.MINUTE_CODE]); + assertEquals(5, timestampComponentsOrder[TimestampComponentType.SECOND_CODE]); + assertEquals(6, timestampComponentsOrder[TimestampComponentType.AM_PM_CODE]); + } + + public void testCreateTimestampFormat_withBracketLiterals() { + Schema schema = Schema.getInstance(); + String rawFormat = "$date2 {$timeMS} $TZOhhmm"; + List formatTokens = new ArrayList<>(); + List formatDelimiterParts = new ArrayList<>(); + PatternUtils.parseMultiTokenFormat( + rawFormat, + schema.getTokenTypes(), + schema.getAllTokenBoundaryChars(), + formatTokens, + formatDelimiterParts + ); + MultiTokenFormat multiTokenFormat = new MultiTokenFormat(rawFormat, formatDelimiterParts, formatTokens); + + TimestampFormat result = SchemaCompiler.createTimestampFormat(multiTokenFormat); + + // bracket literals should be escaped in the Java time format; double-space should be preserved + assertEquals("yyyy-MM-dd '{'HH:mm:ss.SSS'}' Z", result.getJavaTimeFormat()); + int[] order = result.getTimestampComponentsOrder(); + assertEquals(0, order[TimestampComponentType.YEAR_CODE]); + assertEquals(1, order[TimestampComponentType.MONTH_CODE]); + assertEquals(2, order[TimestampComponentType.DAY_CODE]); + assertEquals(3, order[TimestampComponentType.HOUR_CODE]); + assertEquals(4, order[TimestampComponentType.MINUTE_CODE]); + assertEquals(5, order[TimestampComponentType.SECOND_CODE]); + assertEquals(6, order[TimestampComponentType.MILLISECOND_CODE]); + } + + private static int getBitmaskForInteger(int value, CompiledSchema compiledSchema) { + int[] integerSubTokenBitmaskArrayRanges = compiledSchema.integerSubTokenBitmaskArrayRanges; + int[] integerSubTokenBitmasks = compiledSchema.integerSubTokenBitmasks; + for (int i = 0; i < integerSubTokenBitmaskArrayRanges.length; i++) { + if (value <= integerSubTokenBitmaskArrayRanges[i]) { + if (i == integerSubTokenBitmaskArrayRanges.length - 1) { + throw new IllegalArgumentException("Value " + value + " exceeds maximum range defined in schema."); + } + return integerSubTokenBitmasks[i]; + } + } + throw new IllegalArgumentException("Value " + value + " is below minimum range defined in schema."); + } + + public void testSpecialCharacters() { + Schema schema = Schema.getInstance(); + CompiledSchema compiledSchema = SchemaCompiler.compile(schema); + assertNotNull(compiledSchema); + CharSpecificParsingInfo[] charSpecificParsingInfos = compiledSchema.charSpecificParsingInfos; + for (char c 
: schema.getTokenDelimiters()) { + CharSpecificParsingInfo info = charSpecificParsingInfos[c]; + assertNotNull("CharSpecificParsingInfo should be defined for token delimiter char: '" + c + "'", info); + assertNotNull(info.tokenBitmaskPerDelimiterPosition); + assertNotNull(info.bitmaskGeneratorPerPosition); + if (c == ' ') { + assertNotNull(info.multiTokenBitmaskPerDelimiterPartPosition); + } + } + for (char c : "./:-".toCharArray()) { + CharSpecificParsingInfo info = charSpecificParsingInfos[c]; + assertNotNull("CharSpecificParsingInfo should be defined for sub-token delimiter char: '" + c + "'", info); + assertNotNull("info.bitmaskPerPosition is null for char: " + c, info.tokenBitmaskPerDelimiterPosition); + if (c == '-') { + assertNotNull("info.bitmaskPerPosition is null for char: " + c, info.bitmaskGeneratorPerPosition); + } + assertNull(info.multiTokenBitmaskPerDelimiterPartPosition); + } + CharSpecificParsingInfo info = charSpecificParsingInfos[',']; + assertNotNull("CharSpecificParsingInfo should be defined for token boundary char: ','", info); + assertNull(info.tokenBitmaskPerDelimiterPosition); + assertNull(info.bitmaskGeneratorPerPosition); + assertNotNull(info.multiTokenBitmaskPerDelimiterPartPosition); + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SubstringToBitmaskChainTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SubstringToBitmaskChainTests.java new file mode 100644 index 0000000000000..6f8728cfd68a6 --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SubstringToBitmaskChainTests.java @@ -0,0 +1,350 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.compiler; + +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubstringToIntegerMap; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubstringView; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.AndStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.AnyString; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.EqualsStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.LengthStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.NotEqualsStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.OrStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringSetConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringToIntMapConstraint; + +import java.util.Map; +import java.util.Set; +import java.util.function.ToIntFunction; + +public class SubstringToBitmaskChainTests extends ESTestCase { + + // Single constraint tests + + public void testChain_SingleMapConstraint() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new StringToIntMapConstraint(Map.of("Jan", 1, "Feb", 2)), 0x01); + + ToIntFunction chain = builder.build(); + + // Should optimize to just a map + assertTrue(chain instanceof SubstringToIntegerMap); + + assertEquals(0x01, chain.applyAsInt(new SubstringView("Jan"))); + assertEquals(0x01, chain.applyAsInt(new SubstringView("Feb"))); + assertEquals(0x00, chain.applyAsInt(new SubstringView("Mar"))); + } + + public void testChain_SingleSetConstraint() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new StringSetConstraint(Set.of("One", "Two")), 0x02); + + ToIntFunction chain = builder.build(); + + // Should optimize to just a map + assertTrue(chain instanceof SubstringToIntegerMap); + + assertEquals(0x02, chain.applyAsInt(new SubstringView("One"))); + assertEquals(0x02, chain.applyAsInt(new SubstringView("Two"))); + assertEquals(0x00, chain.applyAsInt(new SubstringView("Three"))); + } + + public void testChain_SingleEqualsConstraint() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new EqualsStringConstraint("test"), 0x04); + + ToIntFunction chain = builder.build(); + + // Should optimize to just a map - equals can be represented as a map with a single entry + assertTrue(chain instanceof SubstringToIntegerMap); + + assertEquals(0x04, chain.applyAsInt(new SubstringView("test"))); + assertEquals(0x00, chain.applyAsInt(new SubstringView("other"))); + } + + public void testChain_SingleNonMapConstraint() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new LengthStringConstraint(3), 0x08); + + ToIntFunction chain = builder.build(); + + // Should not be a map or chain, just the function itself + assertFalse(chain instanceof SubstringToIntegerMap); + assertFalse(chain instanceof SubstringToBitmaskChain); + + assertEquals(0x08, chain.applyAsInt(new SubstringView("abc"))); + assertEquals(0x00, chain.applyAsInt(new SubstringView("ab"))); + } + + public void testChain_SingleAnyStringConstraint() { + 
SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new AnyString(), 0x10); + + ToIntFunction chain = builder.build(); + + // Should optimize to a simple function that always returns the bitmask + assertFalse(chain instanceof SubstringToIntegerMap); + assertFalse(chain instanceof SubstringToBitmaskChain); + + assertEquals(0x10, chain.applyAsInt(new SubstringView("anything"))); + assertEquals(0x10, chain.applyAsInt(new SubstringView(""))); + assertEquals(0x10, chain.applyAsInt(new SubstringView("test123"))); + } + + // Multiple constraints - all map-based + + public void testChain_MultipleMapConstraints() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new StringToIntMapConstraint(Map.of("Jan", 1)), 0x01); + builder.add(new StringSetConstraint(Set.of("Mon", "Tue")), 0x02); + builder.add(new EqualsStringConstraint("test"), 0x04); + + ToIntFunction chain = builder.build(); + + // Should optimize to just a map + assertTrue(chain instanceof SubstringToIntegerMap); + + assertEquals(0x01, chain.applyAsInt(new SubstringView("Jan"))); + assertEquals(0x02, chain.applyAsInt(new SubstringView("Mon"))); + assertEquals(0x04, chain.applyAsInt(new SubstringView("test"))); + assertEquals(0x00, chain.applyAsInt(new SubstringView("other"))); + } + + public void testChain_MapConstraintsWithOverlap() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new StringToIntMapConstraint(Map.of("A", 1, "B", 2, "D", 4)), 0x01); + builder.add(new StringSetConstraint(Set.of("A", "B", "C")), 0x02); + builder.add(new EqualsStringConstraint("B"), 0x04); + + ToIntFunction chain = builder.build(); + + assertTrue(chain instanceof SubstringToIntegerMap); + + assertEquals(0x03, chain.applyAsInt(new SubstringView("A"))); // First two bitmasks ORed together + assertEquals(0x07, chain.applyAsInt(new SubstringView("B"))); // All three bitmasks ORed together + assertEquals(0x02, chain.applyAsInt(new SubstringView("C"))); + assertEquals(0x01, chain.applyAsInt(new SubstringView("D"))); + assertEquals(0x00, chain.applyAsInt(new SubstringView("E"))); + } + + // Multiple constraints - mixed types + + public void testChain_MapAndNonMapConstraints() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new StringSetConstraint(Set.of("short")), 0x01); + builder.add(new LengthStringConstraint(5), 0x02); + + ToIntFunction chain = builder.build(); + + assertTrue(chain instanceof SubstringToBitmaskChain); + + assertEquals(0x03, chain.applyAsInt(new SubstringView("short"))); // Matches both + assertEquals(0x02, chain.applyAsInt(new SubstringView("other"))); // Only length + assertEquals(0x00, chain.applyAsInt(new SubstringView("no"))); + } + + public void testChain_MapAndAnyString() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new StringSetConstraint(Set.of("special")), 0x01); + builder.add(new AnyString(), 0x02); + + ToIntFunction chain = builder.build(); + + assertTrue(chain instanceof SubstringToBitmaskChain); + + assertEquals(0x03, chain.applyAsInt(new SubstringView("special"))); // Map + AnyString + assertEquals(0x02, chain.applyAsInt(new SubstringView("anything"))); // Just AnyString + assertEquals(0x02, chain.applyAsInt(new SubstringView(""))); + } + + public void testChain_MultipleNonMapConstraints() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new 
LengthStringConstraint(3), 0x01); + builder.add(new NotEqualsStringConstraint("foo"), 0x02); + + ToIntFunction chain = builder.build(); + + assertTrue(chain instanceof SubstringToBitmaskChain); + + assertEquals(0x03, chain.applyAsInt(new SubstringView("bar"))); // Both match + assertEquals(0x01, chain.applyAsInt(new SubstringView("foo"))); // Only length matches + assertEquals(0x02, chain.applyAsInt(new SubstringView("test"))); // Only not-equals matches + assertEquals(0x02, chain.applyAsInt(new SubstringView("ab"))); + } + + // Composite constraints in chains + + public void testChain_WithOrConstraint() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + // OR constraints should be decomposed into multiple chain entries + builder.add(new OrStringConstraint(new EqualsStringConstraint("a"), new EqualsStringConstraint("b")), 0x01); + builder.add(new LengthStringConstraint(1), 0x02); + + ToIntFunction chain = builder.build(); + + assertTrue(chain instanceof SubstringToBitmaskChain); + + assertEquals(0x03, chain.applyAsInt(new SubstringView("a"))); + assertEquals(0x03, chain.applyAsInt(new SubstringView("b"))); + assertEquals(0x02, chain.applyAsInt(new SubstringView("c"))); + assertEquals(0x00, chain.applyAsInt(new SubstringView("ab"))); + } + + public void testChain_WithAndConstraint() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + // AND constraints should remain as a single chain entry + builder.add(new AndStringConstraint(new LengthStringConstraint(3), new NotEqualsStringConstraint("foo")), 0x01); + builder.add(new EqualsStringConstraint("bar"), 0x02); + + ToIntFunction chain = builder.build(); + + assertTrue(chain instanceof SubstringToBitmaskChain); + + assertEquals(0x03, chain.applyAsInt(new SubstringView("bar"))); // Both constraints + assertEquals(0x01, chain.applyAsInt(new SubstringView("baz"))); // Only AND constraint + assertEquals(0x00, chain.applyAsInt(new SubstringView("foo"))); + assertEquals(0x00, chain.applyAsInt(new SubstringView("ab"))); + } + + public void testChain_ComplexCompositeConstraints() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + + // First constraint: (equals "Jan" OR equals "Feb") with bitmask 0x01 + builder.add(new OrStringConstraint(new EqualsStringConstraint("Jan"), new EqualsStringConstraint("Feb")), 0x01); + + // Second constraint: (length 3 AND not equals "Mar") with bitmask 0x02 + builder.add(new AndStringConstraint(new LengthStringConstraint(3), new NotEqualsStringConstraint("Mar")), 0x02); + + // Third constraint: Set with bitmask 0x04 + builder.add(new StringSetConstraint(Set.of("Apr", "May")), 0x04); + + ToIntFunction chain = builder.build(); + + assertTrue(chain instanceof SubstringToBitmaskChain); + + assertEquals(0x03, chain.applyAsInt(new SubstringView("Jan"))); // OR + AND + assertEquals(0x03, chain.applyAsInt(new SubstringView("Feb"))); // OR + AND + assertEquals(0x00, chain.applyAsInt(new SubstringView("Mar"))); // Excluded by AND + assertEquals(0x06, chain.applyAsInt(new SubstringView("Apr"))); // Only Set + assertEquals(0x02, chain.applyAsInt(new SubstringView("Jun"))); // Only AND + assertEquals(0x00, chain.applyAsInt(new SubstringView("December"))); + } + + // Multiple bitmasks for same constraint + + public void testChain_SameConstraintDifferentBitmasks() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new EqualsStringConstraint("test"), 0x01); + builder.add(new 
EqualsStringConstraint("test"), 0x02); + builder.add(new EqualsStringConstraint("test"), 0x04); + + ToIntFunction chain = builder.build(); + + // Should be optimized to a map with ORed bitmask + assertTrue(chain instanceof SubstringToIntegerMap); + + assertEquals(0x07, chain.applyAsInt(new SubstringView("test"))); + assertEquals(0x00, chain.applyAsInt(new SubstringView("other"))); + } + + // Real-world scenarios + + public void testChain_MonthNames() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + + // Full month names with one bitmask + builder.add(new StringSetConstraint(Set.of("January", "February", "March")), 0x01); + + // Abbreviated month names with another bitmask + builder.add(new StringToIntMapConstraint(Map.of("Jan", 1, "Feb", 2, "Mar", 3)), 0x02); + + // Length constraint for any 3-letter month + builder.add(new LengthStringConstraint(3), 0x04); + + ToIntFunction chain = builder.build(); + + assertEquals(0x01, chain.applyAsInt(new SubstringView("January"))); + assertEquals(0x06, chain.applyAsInt(new SubstringView("Jan"))); // Map + Length + assertEquals(0x04, chain.applyAsInt(new SubstringView("Apr"))); // Just length + assertEquals(0x00, chain.applyAsInt(new SubstringView("December"))); + } + + public void testChain_IPv4Octets() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + + // Common values as a set + builder.add(new StringSetConstraint(Set.of("0", "1", "127", "255")), 0x01); + + // Any 3-digit value + builder.add(new LengthStringConstraint(3), 0x02); + + // Not certain invalid values + builder.add(new NotEqualsStringConstraint("256"), 0x04); + + ToIntFunction chain = builder.build(); + + assertEquals(0x05, chain.applyAsInt(new SubstringView("0"))); // Set + NotEquals + assertEquals(0x07, chain.applyAsInt(new SubstringView("127"))); // All three + assertEquals(0x06, chain.applyAsInt(new SubstringView("192"))); // Length + NotEquals + assertEquals(0x02, chain.applyAsInt(new SubstringView("256"))); // Length only + } + + public void testChain_EmptyBuilder() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + assertNull(builder.build()); + } + + public void testChain_OnlyAnyString() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new AnyString(), 0xFF); + + ToIntFunction chain = builder.build(); + + // Should be optimized to a simple function + assertFalse(chain instanceof SubstringToBitmaskChain); + assertFalse(chain instanceof SubstringToIntegerMap); + + assertEquals(0xFF, chain.applyAsInt(new SubstringView("anything"))); + assertEquals(0xFF, chain.applyAsInt(new SubstringView(""))); + } + + public void testChain_MultipleAnyStrings() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + builder.add(new AnyString(), 0x01); + builder.add(new AnyString(), 0x02); + builder.add(new AnyString(), 0x04); + + ToIntFunction chain = builder.build(); + + // All AnyString bitmasks should be ORed together + assertEquals(0x07, chain.applyAsInt(new SubstringView("anything"))); + assertEquals(0x07, chain.applyAsInt(new SubstringView(""))); + } + + public void testChain_LargeNumberOfConstraints() { + SubstringToBitmaskChain.Builder builder = SubstringToBitmaskChain.builder(); + + // Add many different constraints + for (int i = 0; i < 10; i++) { + builder.add(new EqualsStringConstraint("val" + i), 1 << i); + } + builder.add(new LengthStringConstraint(4), 1 << 10); + builder.add(new NotEqualsStringConstraint("none"), 1 
<< 11); + + ToIntFunction chain = builder.build(); + + assertTrue(chain instanceof SubstringToBitmaskChain); + + assertEquals((1) | (1 << 10) | (1 << 11), chain.applyAsInt(new SubstringView("val0"))); + assertEquals((1 << 5) | (1 << 10) | (1 << 11), chain.applyAsInt(new SubstringView("val5"))); + assertEquals((1 << 10) | (1 << 11), chain.applyAsInt(new SubstringView("test"))); + assertEquals(1 << 11, chain.applyAsInt(new SubstringView("testing"))); + assertEquals(1 << 10, chain.applyAsInt(new SubstringView("none"))); + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SubstringToBitmaskFunctionFactoryTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SubstringToBitmaskFunctionFactoryTests.java new file mode 100644 index 0000000000000..8d8d7f2e4d699 --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SubstringToBitmaskFunctionFactoryTests.java @@ -0,0 +1,253 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.compiler; + +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser.SubstringView; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.AndStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.AnyString; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.EqualsStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.LengthStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.NotEqualsStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.OrStringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringSetConstraint; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints.StringToIntMapConstraint; + +import java.util.Map; +import java.util.Set; +import java.util.function.ToIntFunction; + +public class SubstringToBitmaskFunctionFactoryTests extends ESTestCase { + + // Simple constraint tests + + public void testEqualsConstraint() { + StringConstraint constraint = new EqualsStringConstraint("test"); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x01, constraint); + + assertEquals(0x01, function.applyAsInt(new SubstringView("test"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("other"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("Test"))); + assertEquals(0x00, function.applyAsInt(new SubstringView(""))); + } + + public void testNotEqualsConstraint() { + StringConstraint constraint = new NotEqualsStringConstraint("test"); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x02, constraint); + + assertEquals(0x00, function.applyAsInt(new SubstringView("test"))); + assertEquals(0x02, function.applyAsInt(new SubstringView("other"))); + assertEquals(0x02, function.applyAsInt(new 
SubstringView("Test"))); + assertEquals(0x02, function.applyAsInt(new SubstringView(""))); + } + + public void testLengthConstraint() { + StringConstraint constraint = new LengthStringConstraint(3); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x04, constraint); + + assertEquals(0x04, function.applyAsInt(new SubstringView("abc"))); + assertEquals(0x04, function.applyAsInt(new SubstringView("123"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("ab"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("abcd"))); + assertEquals(0x00, function.applyAsInt(new SubstringView(""))); + } + + public void testAnyStringConstraint() { + StringConstraint constraint = new AnyString(); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x08, constraint); + + assertEquals(0x08, function.applyAsInt(new SubstringView("anything"))); + assertEquals(0x08, function.applyAsInt(new SubstringView(""))); + assertEquals(0x08, function.applyAsInt(new SubstringView("123"))); + assertEquals(0x08, function.applyAsInt(new SubstringView("!@#$%"))); + } + + public void testStringSetConstraint() { + StringConstraint constraint = new StringSetConstraint(Set.of("One", "Two", "Three")); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x10, constraint); + + assertEquals(0x10, function.applyAsInt(new SubstringView("One"))); + assertEquals(0x10, function.applyAsInt(new SubstringView("Two"))); + assertEquals(0x10, function.applyAsInt(new SubstringView("Three"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("Four"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("one"))); + } + + public void testStringToIntMapConstraint() { + StringConstraint constraint = new StringToIntMapConstraint(Map.of("Jan", 1, "Feb", 2, "Mar", 3)); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x20, constraint); + + assertEquals(0x20, function.applyAsInt(new SubstringView("Jan"))); + assertEquals(0x20, function.applyAsInt(new SubstringView("Feb"))); + assertEquals(0x20, function.applyAsInt(new SubstringView("Mar"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("Apr"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("jan"))); + } + + // Composite constraint tests - OR + + public void testOrConstraint_Simple() { + StringConstraint constraint1 = new EqualsStringConstraint("test"); + StringConstraint constraint2 = new EqualsStringConstraint("Test"); + StringConstraint orConstraint = new OrStringConstraint(constraint1, constraint2); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x40, orConstraint); + + assertEquals(0x40, function.applyAsInt(new SubstringView("test"))); + assertEquals(0x40, function.applyAsInt(new SubstringView("Test"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("TEST"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("other"))); + } + + public void testOrConstraint_WithLength() { + StringConstraint constraint1 = new EqualsStringConstraint("a"); + StringConstraint constraint2 = new LengthStringConstraint(3); + StringConstraint orConstraint = new OrStringConstraint(constraint1, constraint2); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x80, orConstraint); + + assertEquals(0x80, function.applyAsInt(new SubstringView("a"))); + assertEquals(0x80, function.applyAsInt(new SubstringView("abc"))); + assertEquals(0x80, function.applyAsInt(new SubstringView("xyz"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("ab"))); 
+ assertEquals(0x00, function.applyAsInt(new SubstringView("abcd"))); + } + + public void testOrConstraint_MultipleOrs() { + // (equals "a") OR (equals "b") OR (equals "c") + StringConstraint constraint1 = new EqualsStringConstraint("a"); + StringConstraint constraint2 = new EqualsStringConstraint("b"); + StringConstraint constraint3 = new EqualsStringConstraint("c"); + StringConstraint orConstraint = new OrStringConstraint(new OrStringConstraint(constraint1, constraint2), constraint3); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x100, orConstraint); + + assertEquals(0x100, function.applyAsInt(new SubstringView("a"))); + assertEquals(0x100, function.applyAsInt(new SubstringView("b"))); + assertEquals(0x100, function.applyAsInt(new SubstringView("c"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("d"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("ab"))); + } + + // Composite constraint tests - AND + + public void testAndConstraint_Simple() { + StringConstraint constraint1 = new NotEqualsStringConstraint("test"); + StringConstraint constraint2 = new LengthStringConstraint(4); + StringConstraint andConstraint = new AndStringConstraint(constraint1, constraint2); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x200, andConstraint); + + assertEquals(0x00, function.applyAsInt(new SubstringView("test"))); // fails constraint1 + assertEquals(0x200, function.applyAsInt(new SubstringView("pass"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("abc"))); // fails constraint2 + assertEquals(0x00, function.applyAsInt(new SubstringView("abcde"))); // fails constraint2 + } + + public void testAndConstraint_MultipleAnds() { + // (length 4) AND (not equals "test") AND (not equals "pass") + StringConstraint constraint1 = new LengthStringConstraint(4); + StringConstraint constraint2 = new NotEqualsStringConstraint("test"); + StringConstraint constraint3 = new NotEqualsStringConstraint("pass"); + StringConstraint andConstraint = new AndStringConstraint(new AndStringConstraint(constraint1, constraint2), constraint3); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x400, andConstraint); + + assertEquals(0x400, function.applyAsInt(new SubstringView("okay"))); + assertEquals(0x400, function.applyAsInt(new SubstringView("good"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("test"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("pass"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("abc"))); + } + + // Complex composite constraints - mixing AND and OR + + public void testAndOrConstraint_Simple() { + // (equals "a" OR equals "b") AND (length 1) + StringConstraint equals_a = new EqualsStringConstraint("a"); + StringConstraint equals_b = new EqualsStringConstraint("b"); + StringConstraint length_1 = new LengthStringConstraint(1); + StringConstraint orConstraint = new OrStringConstraint(equals_a, equals_b); + StringConstraint andConstraint = new AndStringConstraint(orConstraint, length_1); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x800, andConstraint); + + assertEquals(0x800, function.applyAsInt(new SubstringView("a"))); + assertEquals(0x800, function.applyAsInt(new SubstringView("b"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("c"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("ab"))); + } + + public void testAndOrConstraint_Complex() { + // ((equals "Jan" OR equals "Feb") AND (length 3)) OR (equals "March") + 
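// "March" should match only through the standalone equals branch, since its length fails the length-3 constraint +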
StringConstraint equals_jan = new EqualsStringConstraint("Jan"); + StringConstraint equals_feb = new EqualsStringConstraint("Feb"); + StringConstraint equals_march = new EqualsStringConstraint("March"); + StringConstraint length_3 = new LengthStringConstraint(3); + + StringConstraint orJanFeb = new OrStringConstraint(equals_jan, equals_feb); + StringConstraint andWithLength = new AndStringConstraint(orJanFeb, length_3); + StringConstraint finalOr = new OrStringConstraint(andWithLength, equals_march); + + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x1000, finalOr); + + assertEquals(0x1000, function.applyAsInt(new SubstringView("Jan"))); + assertEquals(0x1000, function.applyAsInt(new SubstringView("Feb"))); + assertEquals(0x1000, function.applyAsInt(new SubstringView("March"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("Apr"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("January"))); + } + + public void testDeeplyNestedConstraints() { + // (((equals "a") OR (equals "b")) AND ((not equals "x") OR (not equals "y"))) AND (length 1) + StringConstraint equals_a = new EqualsStringConstraint("a"); + StringConstraint equals_b = new EqualsStringConstraint("b"); + StringConstraint not_x = new NotEqualsStringConstraint("x"); + StringConstraint not_y = new NotEqualsStringConstraint("y"); + StringConstraint length_1 = new LengthStringConstraint(1); + + StringConstraint orAB = new OrStringConstraint(equals_a, equals_b); + StringConstraint orNotXY = new OrStringConstraint(not_x, not_y); + StringConstraint andFirst = new AndStringConstraint(orAB, orNotXY); + StringConstraint finalAnd = new AndStringConstraint(andFirst, length_1); + + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x2000, finalAnd); + + assertEquals(0x2000, function.applyAsInt(new SubstringView("a"))); + assertEquals(0x2000, function.applyAsInt(new SubstringView("b"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("c"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("x"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("y"))); + assertEquals(0x00, function.applyAsInt(new SubstringView("ab"))); + } + + // Edge cases + + public void testEmptyString() { + StringConstraint lengthZero = new LengthStringConstraint(0); + ToIntFunction function = SubstringToBitmaskFunctionFactory.from(0x4000, lengthZero); + + assertEquals(0x4000, function.applyAsInt(new SubstringView(""))); + assertEquals(0x00, function.applyAsInt(new SubstringView("a"))); + } + + public void testMultipleBitmasks() { + StringConstraint constraint = new EqualsStringConstraint("test"); + + // Test with different bitmask values + ToIntFunction function1 = SubstringToBitmaskFunctionFactory.from(0x01, constraint); + ToIntFunction function2 = SubstringToBitmaskFunctionFactory.from(0xFF, constraint); + ToIntFunction function3 = SubstringToBitmaskFunctionFactory.from(0x80000000, constraint); + + SubstringView test = new SubstringView("test"); + SubstringView other = new SubstringView("other"); + + assertEquals(0x01, function1.applyAsInt(test)); + assertEquals(0xFF, function2.applyAsInt(test)); + assertEquals(0x80000000, function3.applyAsInt(test)); + + assertEquals(0x00, function1.applyAsInt(other)); + assertEquals(0x00, function2.applyAsInt(other)); + assertEquals(0x00, function3.applyAsInt(other)); + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/BitmaskRegistryTests.java 
b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/BitmaskRegistryTests.java new file mode 100644 index 0000000000000..d9b30b3902e42 --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/BitmaskRegistryTests.java @@ -0,0 +1,108 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.TimestampComponentType; + +public class BitmaskRegistryTests extends ESTestCase { + + private BitmaskRegistry commonRegistry; + private SubTokenType type1, type2, type3; + + @Override + public void setUp() throws Exception { + super.setUp(); + commonRegistry = new BitmaskRegistry<>(); + type1 = new SubTokenType("Type1", null, new int[] { 4, 8, 16 }, TimestampComponentType.NA); + type2 = new SubTokenType("Type2", null, new int[] { 32, 64, 128 }, TimestampComponentType.NA); + type3 = new SubTokenType("Type3", null, new int[] { 256, 512, 1024 }, TimestampComponentType.NA); + + commonRegistry.register(type1); + commonRegistry.register(type2); + commonRegistry.register(type3); + commonRegistry.seal(); + } + + public void testGetBitmask() { + assertEquals(1, commonRegistry.getBitmask(type1)); + assertEquals(2, commonRegistry.getBitmask(type2)); + assertEquals(4, commonRegistry.getBitmask(type3)); + } + + public void testGetBitIndex() { + assertEquals(0, commonRegistry.getBitIndex(type1)); + assertEquals(1, commonRegistry.getBitIndex(type2)); + assertEquals(2, commonRegistry.getBitIndex(type3)); + } + + public void testGetTypeByBitIndex() { + assertEquals(type1, commonRegistry.getTypeByBitIndex(0)); + assertEquals(type2, commonRegistry.getTypeByBitIndex(1)); + assertEquals(type3, commonRegistry.getTypeByBitIndex(2)); + } + + public void testGetLeftmostBitIndex() { + assertEquals(0, BitmaskRegistry.getLeftmostBitIndex(1)); + assertEquals(1, BitmaskRegistry.getLeftmostBitIndex(2)); + assertEquals(1, BitmaskRegistry.getLeftmostBitIndex(3)); + assertEquals(2, BitmaskRegistry.getLeftmostBitIndex(4)); + assertEquals(2, BitmaskRegistry.getLeftmostBitIndex(5)); + assertEquals(2, BitmaskRegistry.getLeftmostBitIndex(6)); + assertEquals(2, BitmaskRegistry.getLeftmostBitIndex(7)); + assertEquals(3, BitmaskRegistry.getLeftmostBitIndex(8)); + } + + public void testGetHighestPriorityType() { + assertEquals(type1, commonRegistry.getHighestPriorityType(1)); + assertEquals(type2, commonRegistry.getHighestPriorityType(2)); + assertEquals(type2, commonRegistry.getHighestPriorityType(3)); + assertEquals(type3, commonRegistry.getHighestPriorityType(4)); + assertEquals(type3, commonRegistry.getHighestPriorityType(5)); + assertEquals(type3, commonRegistry.getHighestPriorityType(6)); + assertEquals(type3, commonRegistry.getHighestPriorityType(7)); + assertNull(commonRegistry.getHighestPriorityType(8)); + } + + public void testGetCombinedBitmask() { + assertEquals(7, commonRegistry.getCombinedBitmask()); // 1 | 2 | 4 = 7 + } + + public void testGetHigherLevelBitmaskByPosition() { + int result = commonRegistry.getHigherLevelBitmaskByPosition(3, 1); // 3 = 0b11 + assertEquals(72, result); // 8 | 64 = 72 + + result = 
commonRegistry.getHigherLevelBitmaskByPosition(5, 2); // 5 = 0b101 + assertEquals(1040, result); // 16 | 1024 = 1040 + } + + public void testRegisterSingleInstance() { + BitmaskRegistry<SubTokenType> registry = new BitmaskRegistry<>(); + SubTokenType type = new SubTokenType("Type", null, new int[] { 1, 2, 4 }, TimestampComponentType.NA); + int bitmask = registry.register(type); + assertEquals(1, bitmask); + } + + public void testSealingPreventsRegistration() { + BitmaskRegistry<SubTokenType> registry = new BitmaskRegistry<>(); + SubTokenType type = new SubTokenType("Type", null, new int[] { 1, 2, 4 }, TimestampComponentType.NA); + registry.register(type); + registry.seal(); + SubTokenType newType = new SubTokenType("NewType", null, new int[] { 8, 16, 32 }, TimestampComponentType.NA); + assertThrows(IllegalStateException.class, () -> registry.register(newType)); + } + + public void testRegisterMaximumInstances() { + BitmaskRegistry<SubTokenType> registry = new BitmaskRegistry<>(); + for (int i = 0; i < 32; i++) { + registry.register(new SubTokenType("Type" + i, null, new int[] { 1, 2, 4 }, TimestampComponentType.NA)); + } + SubTokenType type33 = new SubTokenType("Type33", null, new int[] { 8, 16, 32 }, TimestampComponentType.NA); + assertThrows(IllegalStateException.class, () -> registry.register(type33)); + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/CharParserTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/CharParserTests.java new file mode 100644 index 0000000000000..6bfcb38a2177c --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/CharParserTests.java @@ -0,0 +1,32 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0.
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import org.elasticsearch.test.ESTestCase; + +public class CharParserTests extends ESTestCase { + + public void testFindBitmaskForInteger() { + int[] integerSubTokenBitmaskArrayRanges = { 10, 20, 30, Integer.MAX_VALUE }; + int[] integerSubTokenBitmasks = { 1, 2, 3, 0 }; + + assertEquals(1, CharParser.findBitmaskForInteger(-10, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + assertEquals(1, CharParser.findBitmaskForInteger(Integer.MIN_VALUE, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + assertEquals(1, CharParser.findBitmaskForInteger(0, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + assertEquals(1, CharParser.findBitmaskForInteger(9, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + assertEquals(1, CharParser.findBitmaskForInteger(10, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + assertEquals(2, CharParser.findBitmaskForInteger(11, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + assertEquals(2, CharParser.findBitmaskForInteger(19, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + assertEquals(2, CharParser.findBitmaskForInteger(20, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + assertEquals(3, CharParser.findBitmaskForInteger(21, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + assertEquals(3, CharParser.findBitmaskForInteger(29, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + assertEquals(3, CharParser.findBitmaskForInteger(30, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + assertEquals(0, CharParser.findBitmaskForInteger(31, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + assertEquals(0, CharParser.findBitmaskForInteger(Integer.MAX_VALUE, integerSubTokenBitmaskArrayRanges, integerSubTokenBitmasks)); + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/RegexParserAndTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/RegexParserAndTests.java new file mode 100644 index 0000000000000..104ac3eafdee2 --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/RegexParserAndTests.java @@ -0,0 +1,175 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Argument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.IPv4Argument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.IntegerArgument; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.ParseException; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Parser; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Timestamp; + +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static org.hamcrest.Matchers.instanceOf; + +public class RegexParserAndTests extends ESTestCase implements Parser { + + private static final Pattern IPV4_PATTERN = Pattern.compile("\\b(\\d{1,3}(?:\\.\\d{1,3}){3})\\b"); + private static final Pattern INTEGER_PATTERN = Pattern.compile("\\b\\d+\\b"); + + // New timestamp pattern and format + private static final Pattern TIMESTAMP_1_PATTERN = Pattern.compile("\\b\\d{2}/[A-Za-z]{3}/\\d{4}:\\d{2}:\\d{2}:\\d{2} [+-]\\d{4}\\b"); + private static final String TIMESTAMP_1_FORMAT = "dd/MMM/yyyy:HH:mm:ss Z"; + private static final ThreadLocal TIMESTAMP_1_FORMATTER = ThreadLocal.withInitial( + () -> new SimpleDateFormat(TIMESTAMP_1_FORMAT, Locale.ENGLISH) + ); + + // Existing timestamp pattern and format + private static final Pattern TIMESTAMP_2_PATTERN = Pattern.compile( + "\\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \\d{2}, \\d{4} \\d{2}:\\d{2}:\\d{2} (?:AM|PM)\\b" + ); + private static final String TIMESTAMP_2_FORMAT = "MMM dd, yyyy hh:mm:ss a"; + private static final ThreadLocal TIMESTAMP_2_FORMATTER = ThreadLocal.withInitial( + () -> new SimpleDateFormat(TIMESTAMP_2_FORMAT, Locale.ENGLISH) + ); + + public void testTimestampFormat1() throws ParseException { + RegexParserAndTests parser = new RegexParserAndTests(); + + String message = "Oct 05, 2023 02:48:00 PM INFO Response from 127.0.0.1 took 2000 ms"; + List> arguments = parser.parse(message); + StringBuilder patternBuilder = new StringBuilder(); + Parser.constructPattern(message, arguments, patternBuilder, true); + + assertEquals("%T INFO Response from %4 took %I ms", patternBuilder.toString()); + assertEquals(3, arguments.size()); + assertThat(arguments.get(0), instanceOf(Timestamp.class)); + assertEquals("IPV4", arguments.get(1).type().name()); + assertEquals("INTEGER", arguments.get(2).type().name()); + } + + public void testTimestampFormat2() throws ParseException { + RegexParserAndTests parser = new RegexParserAndTests(); + + String message = "05/Oct/2023:14:48:00 +0200 GET /index.html 127.0.0.1 200"; + List> arguments = parser.parse(message); + StringBuilder patternBuilder = new StringBuilder(); + Parser.constructPattern(message, arguments, patternBuilder, true); + + assertEquals("%T GET /index.html %4 %I", patternBuilder.toString()); + assertEquals(3, arguments.size()); + assertThat(arguments.get(0), instanceOf(Timestamp.class)); + assertEquals("IPV4", arguments.get(1).type().name()); + assertEquals("INTEGER", arguments.get(2).type().name()); + } + + /** + * Checks if a position range overlaps with any existing argument in the list + * @param arguments List of existing arguments + * @param startPos Start position of the range to check + * @param length Length of the 
range to check + * @return true if there is an overlap, false otherwise + */ + private boolean isOverlappingWithExistingArguments(List> arguments, int startPos, int length) { + int endPos = startPos + length; + for (Argument arg : arguments) { + int argStart = arg.startPosition(); + int argEnd = argStart + arg.length(); + + // Check if ranges overlap + if ((startPos <= argEnd) && (endPos >= argStart)) { + return true; + } + } + return false; + } + + @Override + public List> parse(String rawMessage) throws ParseException { + if (rawMessage == null || rawMessage.isEmpty()) { + throw new IllegalArgumentException("rawMessage cannot be null or empty"); + } + + List> arguments = new ArrayList<>(); + + // 1. Find and extract timestamp substring (prefer TIMESTAMP_1, then TIMESTAMP_2) + int tsStart = -1, tsEnd = -1; + String tsString = null; + SimpleDateFormat usedFormatter = null; + + Matcher ts1Matcher = TIMESTAMP_1_PATTERN.matcher(rawMessage); + if (ts1Matcher.find()) { + tsString = ts1Matcher.group(); + tsStart = ts1Matcher.start(); + tsEnd = ts1Matcher.end(); + usedFormatter = TIMESTAMP_1_FORMATTER.get(); + } else { + Matcher ts2Matcher = TIMESTAMP_2_PATTERN.matcher(rawMessage); + if (ts2Matcher.find()) { + tsString = ts2Matcher.group(); + tsStart = ts2Matcher.start(); + tsEnd = ts2Matcher.end(); + usedFormatter = TIMESTAMP_2_FORMATTER.get(); + } + } + + if (tsString != null) { + try { + Date date = usedFormatter.parse(tsString); + arguments.add(new Timestamp(tsStart, tsEnd - tsStart, date.getTime(), usedFormatter.toPattern())); + } catch (java.text.ParseException e) { + throw new ParseException("Failed to parse timestamp: " + tsString, e); + } + } + + // 2. Process the rest of the message for IP addresses and integers + String remaining = tsEnd >= 0 ? rawMessage.substring(tsEnd) : rawMessage; + + // Find IP addresses + Matcher ipMatcher = IPV4_PATTERN.matcher(remaining); + while (ipMatcher.find()) { + String ipStr = ipMatcher.group(); + int startPos = tsEnd + ipMatcher.start(); + int length = ipMatcher.end() - ipMatcher.start(); + + // Only add if not overlapping with existing arguments + if (isOverlappingWithExistingArguments(arguments, startPos, length) == false) { + String[] octets = ipStr.split("\\."); + int[] octetValues = new int[4]; + for (int j = 0; j < 4; j++) { + octetValues[j] = Integer.parseInt(octets[j]); + } + arguments.add(new IPv4Argument(startPos, length, octetValues, 0)); + } + } + + // Find integers + Matcher intMatcher = INTEGER_PATTERN.matcher(remaining); + while (intMatcher.find()) { + String intStr = intMatcher.group(); + int startPos = tsEnd + intMatcher.start(); + int length = intMatcher.end() - intMatcher.start(); + + // Only add if not overlapping with existing arguments + if (isOverlappingWithExistingArguments(arguments, startPos, length) == false) { + int value = Integer.parseInt(intStr); + arguments.add(new IntegerArgument(startPos, length, value)); + } + } + + return arguments; + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/StringToIntMapper_Experimental.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/StringToIntMapper_Experimental.java new file mode 100644 index 0000000000000..1e1c808636009 --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/StringToIntMapper_Experimental.java @@ -0,0 +1,144 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import java.util.Set; +import java.util.function.ToIntFunction; + +/** + * A {@link ToIntFunction} that relies on input arrays of string keys and numeric values. + * If the input {@link SubstringView} matches any key, the corresponding value is returned; otherwise - the function returns -1. + * If the input value array is null, the function will act as a set membership test, returning 1 for any matching key and -1 for + * non-matching keys. + * It is optimized for fast lookups by using a hash table that ensures no collisions within the initial set of strings (thus no buckets + * to search through) and fast array access. + * In addition, it avoids allocation of new objects during lookups by using {@link SubstringView} as input as well as + * the overhead related to boxing and unboxing of integers. + * It does not attempt to be minimal (e.g., by relying on perfect hashing), but rather to be sparse, as we want to reduce the chance + * of collisions on lookups as well, because collisions require equality checks. + */ +public final class StringToIntMapper_Experimental implements ToIntFunction { + + public static final int MAX_SIZE = 128; + + private final SubstringView[] hashSet1; + private final SubstringView[] hashSet2; + private final int[] values1; + private final int[] values2; + private final boolean isSet; + private final int hashFactor; + + public StringToIntMapper_Experimental(Set keys) { + this(keys.toArray(new String[0]), null); + } + + public StringToIntMapper_Experimental(final String[] keys, int[] values) { + int setSize = keys.length * 2; + SubstringView[] tmpHashSet1 = null; + SubstringView[] tmpHashSet2 = null; + int[] tmpValues1 = null; + int[] tmpValues2 = null; + isSet = values == null; + + // First attempt: try with a single array (no collisions allowed) + boolean singleArrayFailed = false; + while (tmpHashSet1 == null && singleArrayFailed == false) { + setSize += 23; + if (setSize > MAX_SIZE) { + // We couldn't fit the keys in a single array within the max size limit + singleArrayFailed = true; + } else { + tmpHashSet1 = new SubstringView[setSize]; + tmpValues1 = isSet ? null : new int[setSize]; + for (int i = 0; i < keys.length; i++) { + SubstringView substring = new SubstringView(keys[i]); + int hash = internalHash(substring, setSize); + if (tmpHashSet1[hash] == null) { + tmpHashSet1[hash] = substring; + if (isSet == false) tmpValues1[hash] = values[i]; + } else { + // Collision detected, can't use a single array + tmpHashSet1 = null; + break; + } + } + } + } + + // Second attempt: try with two arrays (one collision per hash allowed) + if (singleArrayFailed) { + setSize = keys.length; + while (tmpHashSet1 == null) { + setSize += 23; + if (setSize > MAX_SIZE) { + throw new IllegalArgumentException( + "Cannot generate a hash table for the provided set with at most one collision per " + + "slot without exceeding the maximum size of " + + MAX_SIZE + ); + } + + tmpHashSet1 = new SubstringView[setSize]; + tmpHashSet2 = new SubstringView[setSize]; + tmpValues1 = isSet ? null : new int[setSize]; + tmpValues2 = isSet ? 
null : new int[setSize]; + + for (int i = 0; i < keys.length; i++) { + SubstringView substring = new SubstringView(keys[i]); + int hash = internalHash(substring, setSize); + if (tmpHashSet1[hash] == null) { + tmpHashSet1[hash] = substring; + if (isSet == false) tmpValues1[hash] = values[i]; + } else if (tmpHashSet2[hash] == null) { + tmpHashSet2[hash] = substring; + if (isSet == false) tmpValues2[hash] = values[i]; + } else { + // More than one collision for this hash slot, need to increase table size + tmpHashSet1 = null; + tmpHashSet2 = null; + break; + } + } + } + } + + this.hashSet1 = tmpHashSet1; + this.hashSet2 = tmpHashSet2; + this.values1 = tmpValues1; + this.values2 = tmpValues2; + this.hashFactor = this.hashSet1.length; + } + + /** + * Calculates the hash code for the given SubstringView and returns its index in the hash table. Insertions and lookups must use the + * exact same hash logic to ensure consistency. + * @param str the SubstringView to calculate the hash for + * @param hashFactor the size of the hash table to use for the hash calculation + * @return the index in the hash table for the given SubstringView + */ + private int internalHash(final SubstringView str, final int hashFactor) { + // calculating hash, then ensuring it is non-negative and finally taking modulo to fit into the hash table size + return (str.hashCode() & 0x7FFFFFFF) % hashFactor; + } + + @Override + public int applyAsInt(final SubstringView input) { + int hash = internalHash(input, hashFactor); + SubstringView match1 = hashSet1[hash]; + if (match1 != null && match1.equals(input)) { + return isSet ? 1 : values1[hash]; + } + if (hashSet2 != null) { + SubstringView match2 = hashSet2[hash]; + if (match2 != null && match2.equals(input)) { + return isSet ? 1 : values2[hash]; + } + } + return -1; + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/SubstringToIntegerMapTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/SubstringToIntegerMapTests.java new file mode 100644 index 0000000000000..01936e3b0bd43 --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/parser/SubstringToIntegerMapTests.java @@ -0,0 +1,57 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.parser; + +import org.elasticsearch.test.ESTestCase; + +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +public class SubstringToIntegerMapTests extends ESTestCase { + + public static final String[] MONTHS = new String[] { + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec" }; + + public static final SubstringToIntegerMap MONTH_MAP = SubstringToIntegerMap.builder() + .addAll(IntStream.range(0, MONTHS.length).boxed().collect(Collectors.toMap(i -> MONTHS[i], i -> i + 1))) + .build(); + + // same as the above but for a month Map with StringMapToIntMapper + public void testMap() { + String testString = "I love the month of July!"; + SubstringView input = new SubstringView(testString); + assertEquals("Value should be 0 for " + input + " in " + testString, 0, MONTH_MAP.applyAsInt(input)); + int indexOfJul = testString.indexOf("Jul"); + input.set(testString, indexOfJul, indexOfJul + 3); + assertEquals("Value should be 7 for " + input + " in " + testString, 7, MONTH_MAP.applyAsInt(input)); + } + + public void testAllTrueValues() { + String testString = "I love the months Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec!"; + SubstringView input = new SubstringView(testString); + for (int i = 0; i < MONTHS.length; i++) { + String month = MONTHS[i]; + int startIndex = testString.indexOf(month); + input.set(startIndex, startIndex + month.length()); + assertEquals("Value should match for " + input + " in " + testString, i + 1, MONTH_MAP.applyAsInt(input)); + } + input.set(1, 5); + assertEquals("Value should be 0 for non-matching substring " + input + " in " + testString, 0, MONTH_MAP.applyAsInt(input)); + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/PatternUtilsParseMultiTokenFormatTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/PatternUtilsParseMultiTokenFormatTests.java new file mode 100644 index 0000000000000..97e309d4a9748 --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/PatternUtilsParseMultiTokenFormatTests.java @@ -0,0 +1,217 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema; + +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +import static org.hamcrest.Matchers.containsString; + +public class PatternUtilsParseMultiTokenFormatTests extends ESTestCase { + + private List tokenTypes; + private Set boundaryChars; + + @Override + public void setUp() throws Exception { + super.setUp(); + this.tokenTypes = createTestTokenTypes(); + this.boundaryChars = Schema.getInstance().getAllTokenBoundaryChars(); + } + + private List createTestTokenTypes() { + List tokenTypes = new ArrayList<>(); + + // Create mock SubTokenType arrays for TokenFormat + SubTokenType[] mockSubTokens = new SubTokenType[1]; + mockSubTokens[0] = createMockSubTokenType("mockSubToken"); + + TokenFormat mockFormat = new TokenFormat("$mockSubToken", new char[0], mockSubTokens); + + tokenTypes.add(new TokenType("time", EncodingType.TIMESTAMP, mockFormat, "Time token")); + tokenTypes.add(new TokenType("Mon", EncodingType.TEXT, mockFormat, "Month token")); + tokenTypes.add(new TokenType("DD", EncodingType.INTEGER, mockFormat, "Day token")); + tokenTypes.add(new TokenType("YYYY", EncodingType.INTEGER, mockFormat, "Year token")); + tokenTypes.add(new TokenType("datetime", EncodingType.TIMESTAMP, mockFormat, "DateTime token")); + tokenTypes.add(new TokenType("TZA", EncodingType.TEXT, mockFormat, "Timezone token")); + tokenTypes.add(new TokenType("ip", EncodingType.IPV4, mockFormat, "IP address token")); + tokenTypes.add(new TokenType("level", EncodingType.TEXT, mockFormat, "Log level token")); + + return tokenTypes; + } + + @SuppressWarnings("SameParameterValue") + private SubTokenType createMockSubTokenType(String name) { + SubTokenBaseType mockBaseType = new SubTokenBaseType( + "mockBase", + EncodingType.TEXT, + "M", + String.class, + "Mock base type", + new char[] { 'a', 'b', 'c' } + ); + return new SubTokenType(name, mockBaseType, "", "Mock sub token"); + } + + public void testParseMultiTokenFormat_SingleToken() { + List delimiterParts = new ArrayList<>(); + List tokens = new ArrayList<>(); + PatternUtils.parseMultiTokenFormat("$time", tokenTypes, boundaryChars, tokens, delimiterParts); + + assertEquals(1, tokens.size()); + assertEquals(0, delimiterParts.size()); + assertEquals("time", tokens.getFirst().name()); + } + + public void testParseMultiTokenFormat_MultipleTokensWithSpaces() { + List delimiterParts = new ArrayList<>(); + List tokens = new ArrayList<>(); + PatternUtils.parseMultiTokenFormat("$Mon $DD $YYYY", tokenTypes, boundaryChars, tokens, delimiterParts); + + assertEquals(3, tokens.size()); + assertEquals(2, delimiterParts.size()); + assertEquals("Mon", tokens.getFirst().name()); + assertEquals(" ", delimiterParts.getFirst()); + assertEquals("DD", tokens.get(1).name()); + assertEquals(" ", delimiterParts.get(1)); + assertEquals("YYYY", tokens.get(2).name()); + } + + public void testParseMultiTokenFormat_TokensWithCommaDelimiter() { + List delimiterParts = new ArrayList<>(); + List tokens = new ArrayList<>(); + PatternUtils.parseMultiTokenFormat("$Mon, $DD $YYYY", tokenTypes, boundaryChars, tokens, delimiterParts); + + assertEquals(3, tokens.size()); + assertEquals(2, delimiterParts.size()); + assertEquals("Mon", tokens.getFirst().name()); + assertEquals(", ", delimiterParts.getFirst()); + assertEquals("DD", tokens.get(1).name()); + assertEquals(" ", 
delimiterParts.get(1)); + assertEquals("YYYY", tokens.get(2).name()); + } + + public void testParseMultiTokenFormat_IllegalLiteralAtStart() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> PatternUtils.parseMultiTokenFormat("Date: $Mon $DD", tokenTypes, boundaryChars, new ArrayList<>(), new ArrayList<>()) + ); + assertThat( + e.getMessage(), + containsString("Invalid format - only token delimiters and token boundary characters are allowed between tokens:") + ); + } + + public void testParseMultiTokenFormat_IllegalLiteralAtEnd() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> PatternUtils.parseMultiTokenFormat( + "$datetime $TZA (UTC)", + tokenTypes, + boundaryChars, + new ArrayList<>(), + new ArrayList<>() + ) + ); + assertThat( + e.getMessage(), + containsString("Invalid format - only token delimiters and token boundary characters are allowed between tokens:") + ); + } + + public void testParseMultiTokenFormat_LiteralTextBetweenWithoutDelimiter() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> PatternUtils.parseMultiTokenFormat("$ip:$level", tokenTypes, boundaryChars, new ArrayList<>(), new ArrayList<>()) + ); + assertThat(e.getMessage(), containsString("Token names must be separated by delimiters:")); + } + + public void testParseMultiTokenFormat_OnlyLiteralText() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> PatternUtils.parseMultiTokenFormat("No tokens here", tokenTypes, boundaryChars, new ArrayList<>(), new ArrayList<>()) + ); + assertThat( + e.getMessage(), + containsString("Invalid format - only token delimiters and token boundary characters are allowed between tokens:") + ); + } + + public void testParseMultiTokenFormat_EmptyString() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> PatternUtils.parseMultiTokenFormat("", tokenTypes, boundaryChars, new ArrayList<>(), new ArrayList<>()) + ); + assertEquals("Format string cannot be null or empty", e.getMessage()); + } + + public void testParseMultiTokenFormat_NullString() { + @SuppressWarnings("DataFlowIssue") + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> PatternUtils.parseMultiTokenFormat(null, tokenTypes, boundaryChars, new ArrayList<>(), new ArrayList<>()) + ); + assertEquals("Format string cannot be null or empty", e.getMessage()); + } + + public void testParseMultiTokenFormat_UnknownTokenType() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> PatternUtils.parseMultiTokenFormat("$unknown", tokenTypes, boundaryChars, new ArrayList<>(), new ArrayList<>()) + ); + assertEquals("Unknown token type: unknown in format: $unknown", e.getMessage()); + } + + public void testParseMultiTokenFormat_InvalidTokenReference() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> PatternUtils.parseMultiTokenFormat("$ ", tokenTypes, boundaryChars, new ArrayList<>(), new ArrayList<>()) + ); + assertEquals("Token name cannot be empty in format: $ ", e.getMessage()); + } + + public void testParseMultiTokenFormat_DollarAtEnd() { + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> PatternUtils.parseMultiTokenFormat("text$", tokenTypes, boundaryChars, new ArrayList<>(), new ArrayList<>()) + ); + assertEquals( + "Invalid format - only token delimiters and token boundary characters are allowed between tokens: 
text$", + e.getMessage() + ); + } + + public void testParseMultiTokenFormat_TokensBoundedByDifferentChars() { + List delimiterParts = new ArrayList<>(); + List tokens = new ArrayList<>(); + PatternUtils.parseMultiTokenFormat("$time;,$level", tokenTypes, boundaryChars, tokens, delimiterParts); + + assertEquals(2, tokens.size()); + assertEquals(1, delimiterParts.size()); + assertEquals("time", tokens.getFirst().name()); + assertEquals(";,", delimiterParts.getFirst()); + assertEquals("level", tokens.get(1).name()); + } + + public void testParseMultiTokenFormat_WhitespaceHandling() { + List delimiterParts = new ArrayList<>(); + List tokens = new ArrayList<>(); + PatternUtils.parseMultiTokenFormat("$time \t$level", tokenTypes, boundaryChars, tokens, delimiterParts); + + assertEquals(2, tokens.size()); + assertEquals(1, delimiterParts.size()); + assertEquals("time", tokens.getFirst().name()); + assertEquals(" \t", delimiterParts.getFirst()); + assertEquals("level", tokens.get(1).name()); + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/SchemaTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/SchemaTests.java new file mode 100644 index 0000000000000..4438340e09e93 --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/SchemaTests.java @@ -0,0 +1,62 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema; + +import org.elasticsearch.core.SuppressForbidden; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.EncodingType; + +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +@SuppressForbidden(reason = "prints out the schema for debugging purposes within test") +public class SchemaTests extends ESTestCase { + public void testReadAndPrintSchema() { + Schema schema = Schema.getInstance(); + assertNotNull(schema); + System.out.println(schema); + } + + public void testMultiTokenTypesParsing() { + Schema schema = Schema.getInstance(); + assertNotNull(schema); + + // build name->type map for easy assertions + Map nameToType = schema.getMultiTokenTypes() + .stream() + .collect(Collectors.toMap(MultiTokenType::name, t -> t)); + + // ensure expected multi-token types from schema.yaml are present + assertTrue("RFC-1123-timestamp should be present", nameToType.containsKey("RFC-1123-timestamp")); + assertTrue( + "logging-libraries-datetime-timestamp should be present", + nameToType.containsKey("logging-libraries-datetime-timestamp") + ); + + MultiTokenType t1 = nameToType.get("RFC-1123-timestamp"); + MultiTokenType t2 = nameToType.get("logging-libraries-datetime-timestamp"); + + // encoding type expected to be %T (Timestamp) + assertEquals("encodingType for RFC-1123-timestamp", EncodingType.TIMESTAMP, t1.encodingType()); + assertEquals("encodingType for logging-libraries-datetime-timestamp", EncodingType.TIMESTAMP, t2.encodingType()); + + nameToType.values().forEach(t -> { + StringBuilder formatFromParts = new StringBuilder(); + List delimiterParts = t.getFormat().getDelimiterParts(); + List tokens = t.getFormat().getTokens(); + for (int i = 0; i < 
tokens.size(); i++) { + formatFromParts.append('$').append(tokens.get(i).name()); + if (i < delimiterParts.size()) { + formatFromParts.append(delimiterParts.get(i)); + } + } + assertEquals(t.getFormat().getRawFormat(), formatFromParts.toString()); + }); + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/IntConstraintsTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/IntConstraintsTests.java new file mode 100644 index 0000000000000..c2f7fd5fc476a --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/IntConstraintsTests.java @@ -0,0 +1,441 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.test.ESTestCase; + +public class IntConstraintsTests extends ESTestCase { + + public void testEqualsConstraint() { + IntConstraint predicate = IntConstraints.parseIntConstraint("==5"); + assertTrue(predicate.isApplicable(5)); + assertFalse(predicate.isApplicable(4)); + assertFalse(predicate.isApplicable(6)); + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(5, 5) }, predicate.trueRanges()); + } + + public void testLessThanConstraint() { + IntConstraint predicate = IntConstraints.parseIntConstraint("<10"); + assertTrue(predicate.isApplicable(9)); + assertTrue(predicate.isApplicable(-100)); + assertFalse(predicate.isApplicable(10)); + assertFalse(predicate.isApplicable(11)); + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(Integer.MIN_VALUE, 9) }, predicate.trueRanges()); + } + + public void testGreaterThanConstraint() { + IntConstraint predicate = IntConstraints.parseIntConstraint(">20"); + assertTrue(predicate.isApplicable(21)); + assertTrue(predicate.isApplicable(100)); + assertFalse(predicate.isApplicable(20)); + assertFalse(predicate.isApplicable(19)); + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(21, Integer.MAX_VALUE) }, predicate.trueRanges()); + } + + public void testLessThanOrEqualConstraint() { + IntConstraint predicate = IntConstraints.parseIntConstraint("<=15"); + assertTrue(predicate.isApplicable(15)); + assertTrue(predicate.isApplicable(14)); + assertTrue(predicate.isApplicable(-50)); + assertFalse(predicate.isApplicable(16)); + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(Integer.MIN_VALUE, 15) }, predicate.trueRanges()); + } + + public void testGreaterThanOrEqualConstraint() { + IntConstraint predicate = IntConstraints.parseIntConstraint(">=25"); + assertTrue(predicate.isApplicable(25)); + assertTrue(predicate.isApplicable(26)); + assertTrue(predicate.isApplicable(100)); + assertFalse(predicate.isApplicable(24)); + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(25, Integer.MAX_VALUE) }, predicate.trueRanges()); + } + + public void testNotEqualConstraint() { + IntConstraint predicate = IntConstraints.parseIntConstraint("!=30"); + assertTrue(predicate.isApplicable(29)); + assertTrue(predicate.isApplicable(31)); + assertFalse(predicate.isApplicable(30)); + assertArrayEquals( + new IntConstraints.Range[] { new 
IntConstraints.Range(Integer.MIN_VALUE, 29), new IntConstraints.Range(31, Integer.MAX_VALUE) }, + predicate.trueRanges() + ); + } + + public void testRangeConstraint() { + IntConstraint predicate = IntConstraints.parseIntConstraint("1-10"); + assertTrue(predicate.isApplicable(1)); + assertTrue(predicate.isApplicable(5)); + assertTrue(predicate.isApplicable(10)); + assertFalse(predicate.isApplicable(0)); + assertFalse(predicate.isApplicable(11)); + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(1, 10) }, predicate.trueRanges()); + } + + public void testSetConstraint() { + IntConstraint predicate = IntConstraints.parseIntConstraint("1|3|5|7"); + assertTrue(predicate.isApplicable(1)); + assertTrue(predicate.isApplicable(3)); + assertTrue(predicate.isApplicable(5)); + assertTrue(predicate.isApplicable(7)); + assertFalse(predicate.isApplicable(2)); + assertFalse(predicate.isApplicable(4)); + assertFalse(predicate.isApplicable(8)); + assertArrayEquals( + new IntConstraints.Range[] { + new IntConstraints.Range(1, 1), + new IntConstraints.Range(3, 3), + new IntConstraints.Range(5, 5), + new IntConstraints.Range(7, 7) }, + predicate.trueRanges() + ); + } + + public void testLengthConstraint() { + IntConstraint predicate = IntConstraints.parseIntConstraint("{3}"); + assertTrue(predicate.isApplicable(123)); + assertTrue(predicate.isApplicable(100)); + assertTrue(predicate.isApplicable(999)); + assertFalse(predicate.isApplicable(1000)); + assertFalse(predicate.isApplicable(99)); + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(100, 999) }, predicate.trueRanges()); + } + + public void testAndConstraint() { + IntConstraint predicate = IntConstraints.parseIntConstraint(">5 && <10"); + assertTrue(predicate.isApplicable(6)); + assertTrue(predicate.isApplicable(9)); + assertFalse(predicate.isApplicable(5)); + assertFalse(predicate.isApplicable(10)); + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(6, 9) }, predicate.trueRanges()); + } + + public void testOrConstraint() { + IntConstraint predicate = IntConstraints.parseIntConstraint("<5 || >10"); + assertTrue(predicate.isApplicable(4)); + assertTrue(predicate.isApplicable(11)); + assertTrue(predicate.isApplicable(0)); + assertTrue(predicate.isApplicable(20)); + assertFalse(predicate.isApplicable(5)); + assertFalse(predicate.isApplicable(10)); + assertArrayEquals( + new IntConstraints.Range[] { new IntConstraints.Range(Integer.MIN_VALUE, 4), new IntConstraints.Range(11, Integer.MAX_VALUE) }, + predicate.trueRanges() + ); + } + + public void testComplexNestedConstraint() { + IntConstraint predicate = IntConstraints.parseIntConstraint("==5 || >=10 && <=20"); + assertTrue(predicate.isApplicable(5)); + assertTrue(predicate.isApplicable(10)); + assertTrue(predicate.isApplicable(15)); + assertTrue(predicate.isApplicable(20)); + assertFalse(predicate.isApplicable(4)); + assertFalse(predicate.isApplicable(6)); + assertFalse(predicate.isApplicable(9)); + assertFalse(predicate.isApplicable(21)); + assertArrayEquals( + new IntConstraints.Range[] { new IntConstraints.Range(5, 5), new IntConstraints.Range(10, 20) }, + predicate.trueRanges() + ); + } + + public void testIntegerBoundaries() { + // Test at Integer.MAX_VALUE and Integer.MIN_VALUE + IntConstraint maxValuePredicate = IntConstraints.parseIntConstraint("==" + Integer.MAX_VALUE); + assertTrue(maxValuePredicate.isApplicable(Integer.MAX_VALUE)); + assertFalse(maxValuePredicate.isApplicable(Integer.MAX_VALUE - 1)); + assertArrayEquals( + new 
IntConstraints.Range[] { new IntConstraints.Range(Integer.MAX_VALUE, Integer.MAX_VALUE) }, + maxValuePredicate.trueRanges() + ); + + IntConstraint minValuePredicate = IntConstraints.parseIntConstraint("==" + Integer.MIN_VALUE); + assertTrue(minValuePredicate.isApplicable(Integer.MIN_VALUE)); + assertFalse(minValuePredicate.isApplicable(Integer.MIN_VALUE + 1)); + assertArrayEquals( + new IntConstraints.Range[] { new IntConstraints.Range(Integer.MIN_VALUE, Integer.MIN_VALUE) }, + minValuePredicate.trueRanges() + ); + } + + public void testNegativeNumbers() { + IntConstraint predicate = IntConstraints.parseIntConstraint("(-10)-(-5)"); + assertTrue(predicate.isApplicable(-10)); + assertTrue(predicate.isApplicable(-7)); + assertTrue(predicate.isApplicable(-5)); + assertFalse(predicate.isApplicable(-11)); + assertFalse(predicate.isApplicable(-4)); + assertFalse(predicate.isApplicable(0)); + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(-10, -5) }, predicate.trueRanges()); + } + + public void testZeroHandling() { + IntConstraint equalsZero = IntConstraints.parseIntConstraint("==0"); + assertTrue(equalsZero.isApplicable(0)); + assertFalse(equalsZero.isApplicable(1)); + assertFalse(equalsZero.isApplicable(-1)); + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(0, 0) }, equalsZero.trueRanges()); + + IntConstraint rangeIncludingZero = IntConstraints.parseIntConstraint("(-5)-5"); + assertTrue(rangeIncludingZero.isApplicable(0)); + assertTrue(rangeIncludingZero.isApplicable(-5)); + assertTrue(rangeIncludingZero.isApplicable(5)); + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(-5, 5) }, rangeIncludingZero.trueRanges()); + } + + public void testNullOrEmptyConstraint() { + assertEquals(AnyInteger.INSTANCE, IntConstraints.parseIntConstraint(null)); + assertEquals(AnyInteger.INSTANCE, IntConstraints.parseIntConstraint("")); + assertEquals(AnyInteger.INSTANCE, IntConstraints.parseIntConstraint(" ")); + } + + public void testInvalidConstraintFormat() { + // Invalid operator + assertThrows(IllegalArgumentException.class, () -> IntConstraints.parseIntConstraint("=>5")); + + // Missing operand + assertThrows(IllegalArgumentException.class, () -> IntConstraints.parseIntConstraint("==")); + + // Invalid number format + assertThrows(IllegalArgumentException.class, () -> IntConstraints.parseIntConstraint("==abc")); + + // Incorrectly formatted range + assertThrows(IllegalArgumentException.class, () -> IntConstraints.parseIntConstraint("5-")); + assertThrows(IllegalArgumentException.class, () -> IntConstraints.parseIntConstraint("-5")); + + // Invalid range (upper bound < lower bound) + assertThrows(IllegalArgumentException.class, () -> IntConstraints.parseIntConstraint("10-5")); + } + + public void testWhitespaceHandling() { + // Test different whitespace formats + IntConstraint standard = IntConstraints.parseIntConstraint("5-10"); + IntConstraint withSpaces = IntConstraints.parseIntConstraint(" 5 - 10 "); + IntConstraint manySpaces = IntConstraints.parseIntConstraint(" 5 - 10 "); + + // All should behave the same + assertTrue(standard.isApplicable(7)); + assertTrue(withSpaces.isApplicable(7)); + assertTrue(manySpaces.isApplicable(7)); + + assertFalse(standard.isApplicable(11)); + assertFalse(withSpaces.isApplicable(11)); + assertFalse(manySpaces.isApplicable(11)); + + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(5, 10) }, standard.trueRanges()); + assertArrayEquals(new IntConstraints.Range[] { new 
IntConstraints.Range(5, 10) }, withSpaces.trueRanges()); + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(5, 10) }, manySpaces.trueRanges()); + } + + public void testAndWithOverlappingRanges() { + IntConstraint range1 = IntConstraints.parseIntConstraint("1-10"); + IntConstraint range2 = IntConstraints.parseIntConstraint("5-15"); + IntConstraint combined = range1.and(range2); + + // Overlapping range is 5-10 + assertTrue(combined.isApplicable(5)); + assertTrue(combined.isApplicable(10)); + assertFalse(combined.isApplicable(4)); + assertFalse(combined.isApplicable(11)); + + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(5, 10) }, combined.trueRanges()); + } + + public void testAndWithNonOverlappingRanges() { + IntConstraint range1 = IntConstraints.parseIntConstraint("1-5"); + IntConstraint range2 = IntConstraints.parseIntConstraint("10-15"); + IntConstraint combined = range1.and(range2); + + // No overlap, should accept no values + assertFalse(combined.isApplicable(3)); + assertFalse(combined.isApplicable(12)); + + assertArrayEquals(new IntConstraints.Range[] {}, combined.trueRanges()); + } + + public void testOrWithOverlappingRanges() { + IntConstraint range1 = IntConstraints.parseIntConstraint("1-10"); + IntConstraint range2 = IntConstraints.parseIntConstraint("5-15"); + IntConstraint combined = range1.or(range2); + + // Combined range is 1-15 + assertTrue(combined.isApplicable(1)); + assertTrue(combined.isApplicable(10)); + assertTrue(combined.isApplicable(15)); + assertFalse(combined.isApplicable(0)); + assertFalse(combined.isApplicable(16)); + + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(1, 15) }, combined.trueRanges()); + } + + public void testOrWithNonOverlappingRanges() { + IntConstraint range1 = IntConstraints.parseIntConstraint("1-5"); + IntConstraint range2 = IntConstraints.parseIntConstraint("10-15"); + IntConstraint combined = range1.or(range2); + + // Combined range is 1-5 and 10-15 + assertTrue(combined.isApplicable(3)); + assertTrue(combined.isApplicable(12)); + assertFalse(combined.isApplicable(6)); + assertFalse(combined.isApplicable(9)); + + assertArrayEquals( + new IntConstraints.Range[] { new IntConstraints.Range(1, 5), new IntConstraints.Range(10, 15) }, + combined.trueRanges() + ); + } + + public void testComplexAndOrCombination() { + IntConstraint range1 = IntConstraints.parseIntConstraint("1-10"); + IntConstraint range2 = IntConstraints.parseIntConstraint("5-15"); + IntConstraint range3 = IntConstraints.parseIntConstraint("20-25"); + IntConstraint range4 = IntConstraints.parseIntConstraint("8-22"); + + // (1-10 AND 5-15) OR (20-25 AND 8-22) + IntConstraint combined = range1.and(range2).or(range3.and(range4)); + + // Valid ranges: 5-10 (from AND of 1-10 and 5-15) and 20-22 (from AND of 20-25 and 8-22) + assertTrue(combined.isApplicable(5)); + assertTrue(combined.isApplicable(10)); + assertTrue(combined.isApplicable(20)); + assertTrue(combined.isApplicable(22)); + assertFalse(combined.isApplicable(4)); + assertFalse(combined.isApplicable(11)); + assertFalse(combined.isApplicable(19)); + assertFalse(combined.isApplicable(23)); + + assertArrayEquals( + new IntConstraints.Range[] { new IntConstraints.Range(5, 10), new IntConstraints.Range(20, 22) }, + combined.trueRanges() + ); + } + + public void testMultipleChainedAnd() { + IntConstraint range1 = IntConstraints.parseIntConstraint("10-30"); + IntConstraint range2 = IntConstraints.parseIntConstraint("1-20"); + IntConstraint range3 = 
IntConstraints.parseIntConstraint("15-25"); + + // 1-20 AND 10-30 AND 15-25 + IntConstraint combined = range1.and(range2).and(range3); + + // Valid range: 15-20 + assertTrue(combined.isApplicable(15)); + assertTrue(combined.isApplicable(20)); + assertFalse(combined.isApplicable(14)); + assertFalse(combined.isApplicable(21)); + + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(15, 20) }, combined.trueRanges()); + } + + public void testMultipleChainedOr() { + IntConstraint range1 = IntConstraints.parseIntConstraint("20-25"); + IntConstraint range2 = IntConstraints.parseIntConstraint("1-5"); + IntConstraint range3 = IntConstraints.parseIntConstraint("10-15"); + + // 1-5 OR 10-15 OR 20-25 + IntConstraint combined = range1.or(range2).or(range3); + + // Valid ranges: 1-5, 10-15, 20-25 + assertTrue(combined.isApplicable(3)); + assertTrue(combined.isApplicable(12)); + assertTrue(combined.isApplicable(22)); + assertFalse(combined.isApplicable(6)); + assertFalse(combined.isApplicable(9)); + assertFalse(combined.isApplicable(26)); + + assertArrayEquals( + new IntConstraints.Range[] { + new IntConstraints.Range(1, 5), + new IntConstraints.Range(10, 15), + new IntConstraints.Range(20, 25) }, + combined.trueRanges() + ); + } + + public void testComplexNestedCombination_1() { + IntConstraint range1 = IntConstraints.parseIntConstraint("1-10"); + IntConstraint range2 = IntConstraints.parseIntConstraint("5-15"); + IntConstraint range3 = IntConstraints.parseIntConstraint("20-30"); + IntConstraint range4 = IntConstraints.parseIntConstraint("25-35"); + + // ((1-10 AND 5-15) OR 20-30) AND 25-35 + IntConstraint combined = range1.and(range2).or(range3).and(range4); + + // Valid range: 25-30 (from OR of 1-10 AND 5-15 and 20-30, intersected with 25-35) + assertTrue(combined.isApplicable(25)); + assertTrue(combined.isApplicable(30)); + assertFalse(combined.isApplicable(24)); + assertFalse(combined.isApplicable(31)); + + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(25, 30) }, combined.trueRanges()); + } + + public void testComplexNestedCombination_2() { + IntConstraint range1 = IntConstraints.parseIntConstraint("5-10"); + IntConstraint range2 = IntConstraints.parseIntConstraint("8-12"); + IntConstraint range3 = IntConstraints.parseIntConstraint("18-30"); + IntConstraint range4 = IntConstraints.parseIntConstraint("1-20"); + + // (5-10 OR 8-12 OR 18-30) AND (1-20) + IntConstraint combined = range1.or(range2).or(range3).and(range4); + + // Valid ranges: 5-12 and 18-20 + assertTrue(combined.isApplicable(5)); + assertTrue(combined.isApplicable(8)); + assertTrue(combined.isApplicable(9)); + assertTrue(combined.isApplicable(10)); + assertTrue(combined.isApplicable(12)); + assertTrue(combined.isApplicable(18)); + assertTrue(combined.isApplicable(20)); + assertFalse(combined.isApplicable(4)); + assertFalse(combined.isApplicable(13)); + assertFalse(combined.isApplicable(17)); + assertFalse(combined.isApplicable(21)); + + assertArrayEquals( + new IntConstraints.Range[] { new IntConstraints.Range(5, 12), new IntConstraints.Range(18, 20) }, + combined.trueRanges() + ); + } + + public void testComplexNestedCombination_3() { + IntConstraint range1 = IntConstraints.parseIntConstraint("(-50)-(-10)"); + IntConstraint range2 = IntConstraints.parseIntConstraint("(-30)-(-10)"); + IntConstraint range3 = IntConstraints.parseIntConstraint("(-10)-100"); + IntConstraint equals1 = IntConstraints.parseIntConstraint("==(-10)"); + + // (-50)-(-10) AND (-30)-(-10) AND ==(-10) AND (-10)-100 + 
IntConstraint combined = range1.and(range2).and(equals1).and(range3); + + // only -10 is valid + assertTrue(combined.isApplicable(-10)); + assertFalse(combined.isApplicable(-50)); + assertFalse(combined.isApplicable(-30)); + assertFalse(combined.isApplicable(-11)); + assertFalse(combined.isApplicable(-9)); + + assertArrayEquals(new IntConstraints.Range[] { new IntConstraints.Range(-10, -10) }, combined.trueRanges()); + + IntConstraint notEquals1 = IntConstraints.parseIntConstraint("!=10"); + combined = combined.or(notEquals1); + + // everything except 10 is valid + assertFalse(combined.isApplicable(10)); + assertTrue(combined.isApplicable(9)); + assertTrue(combined.isApplicable(11)); + assertTrue(combined.isApplicable(-10)); + assertTrue(combined.isApplicable(-50)); + assertTrue(combined.isApplicable(Integer.MIN_VALUE)); + assertTrue(combined.isApplicable(Integer.MAX_VALUE)); + + assertArrayEquals( + new IntConstraints.Range[] { new IntConstraints.Range(Integer.MIN_VALUE, 9), new IntConstraints.Range(11, Integer.MAX_VALUE) }, + combined.trueRanges() + ); + } +} diff --git a/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringConstraintsTests.java b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringConstraintsTests.java new file mode 100644 index 0000000000000..c528bd9ab9708 --- /dev/null +++ b/x-pack/plugin/logsdb/src/test/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/schema/constraints/StringConstraintsTests.java @@ -0,0 +1,189 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.constraints; + +import org.elasticsearch.test.ESTestCase; + +import java.util.Set; + +import static org.hamcrest.Matchers.instanceOf; + +public class StringConstraintsTests extends ESTestCase { + + public void testEqualsConstraint() { + StringConstraint constraint = new EqualsStringConstraint("test"); + assertTrue(constraint.isApplicable("test")); + assertFalse(constraint.isApplicable("Test")); + assertFalse(constraint.isApplicable("testing")); + } + + public void testNotEqualsConstraint() { + StringConstraint constraint = new NotEqualsStringConstraint("test"); + assertTrue(constraint.isApplicable("Test")); + assertTrue(constraint.isApplicable("testing")); + assertFalse(constraint.isApplicable("test")); + } + + public void testSetConstraint() { + StringConstraint constraint = new StringSetConstraint(Set.of("One", "Two", "Three")); + assertTrue(constraint.isApplicable("One")); + assertTrue(constraint.isApplicable("Two")); + assertTrue(constraint.isApplicable("Three")); + assertFalse(constraint.isApplicable("Four")); + } + + public void testMapConstraintWithParsing() { + StringConstraint constraint = StringConstraints.parseStringConstraint("one=1| two=2| three = 3 "); + assertThat(constraint, instanceOf(StringToIntMapConstraint.class)); + assertTrue(constraint.isApplicable("one")); + assertTrue(constraint.isApplicable("two")); + assertTrue(constraint.isApplicable("three")); + assertFalse(constraint.isApplicable("four")); + } + + public void testLengthConstraint() { + StringConstraint constraint = new LengthStringConstraint(3); + assertTrue(constraint.isApplicable("abc")); + assertTrue(constraint.isApplicable("123")); + assertFalse(constraint.isApplicable("abcd")); + assertFalse(constraint.isApplicable("ab")); + } + + public void testAndConstraint() { + StringConstraint constraint1 = new EqualsStringConstraint("test"); + StringConstraint constraint2 = new LengthStringConstraint(4); + StringConstraint combined = constraint1.and(constraint2); + + assertThat(combined, instanceOf(AndStringConstraint.class)); + AndStringConstraint andConstraint = (AndStringConstraint) combined; + assertEquals(constraint1, andConstraint.first()); + assertEquals(constraint2, andConstraint.second()); + + assertTrue(combined.isApplicable("test")); + assertFalse(combined.isApplicable("Test")); + assertFalse(combined.isApplicable("testing")); + } + + public void testOrConstraint() { + StringConstraint constraint1 = new EqualsStringConstraint("test"); + StringConstraint constraint2 = new EqualsStringConstraint("Test"); + StringConstraint combined = constraint1.or(constraint2); + + assertThat(combined, instanceOf(OrStringConstraint.class)); + OrStringConstraint orConstraint = (OrStringConstraint) combined; + assertEquals(constraint1, orConstraint.first()); + assertEquals(constraint2, orConstraint.second()); + + assertTrue(combined.isApplicable("test")); + assertTrue(combined.isApplicable("Test")); + assertFalse(combined.isApplicable("testing")); + } + + public void testNullOrEmptyConstraint() { + assertEquals(AnyString.INSTANCE, StringConstraints.parseStringConstraint(null)); + assertEquals(AnyString.INSTANCE, StringConstraints.parseStringConstraint("")); + assertEquals(AnyString.INSTANCE, StringConstraints.parseStringConstraint(" ")); + } + + public void testWhitespaceHandling() { + StringConstraint standard = StringConstraints.parseStringConstraint("One|Two"); + assertThat(standard, instanceOf(StringSetConstraint.class)); + StringConstraint 
withSpaces = StringConstraints.parseStringConstraint(" One | Two "); + assertThat(withSpaces, instanceOf(StringSetConstraint.class)); + StringConstraint manySpaces = StringConstraints.parseStringConstraint(" One | Two "); + assertThat(manySpaces, instanceOf(StringSetConstraint.class)); + + assertTrue(standard.isApplicable("One")); + assertTrue(withSpaces.isApplicable("Two")); + assertTrue(manySpaces.isApplicable("One")); + assertFalse(standard.isApplicable("Three")); + assertFalse(withSpaces.isApplicable("Three")); + assertFalse(manySpaces.isApplicable("Three")); + } + + public void testComplexNestedConstraint() { + StringConstraint constraint1 = StringConstraints.parseStringConstraint("One|Two"); + StringConstraint constraint2 = StringConstraints.parseStringConstraint("Three|Four"); + StringConstraint combined = constraint1.or(constraint2); + + assertTrue(combined.isApplicable("One")); + assertTrue(combined.isApplicable("Three")); + assertFalse(combined.isApplicable("Five")); + } + + public void testGetValidCharactersEqualsConstraint() { + StringConstraint constraint = new EqualsStringConstraint("test"); + char[] validChars = constraint.getValidCharacters(); + + assertNotNull(validChars); + assertEquals(3, validChars.length); // 't', 'e', 's' + String result = new String(validChars); + assertTrue(result.contains("t")); + assertTrue(result.contains("e")); + assertTrue(result.contains("s")); + } + + public void testGetValidCharactersSetConstraint() { + StringConstraint constraint = new StringSetConstraint(Set.of("One", "Two", "Three")); + char[] validChars = constraint.getValidCharacters(); + + assertNotNull(validChars); + assertEquals(8, validChars.length); // 'O', 'n', 'e', 'T', 'w', 'o', 'h', 'r' + String result = new String(validChars); + assertTrue(result.contains("O")); + assertTrue(result.contains("n")); + assertTrue(result.contains("e")); + assertTrue(result.contains("T")); + assertTrue(result.contains("w")); + assertTrue(result.contains("o")); + assertTrue(result.contains("h")); + assertTrue(result.contains("r")); + } + + public void testGetValidCharactersLengthConstraint() { + StringConstraint constraint = new LengthStringConstraint(3); + char[] validChars = constraint.getValidCharacters(); + + assertNull(validChars); // Length constraint does not define valid characters + } + + public void testGetValidCharactersAndConstraint() { + StringConstraint constraint1 = new EqualsStringConstraint("test"); + StringConstraint constraint2 = new StringSetConstraint(Set.of("test", "testing")); + StringConstraint combined = constraint1.and(constraint2); + + char[] validChars = combined.getValidCharacters(); + + assertNotNull(validChars); + assertEquals(3, validChars.length); // Intersection: 't', 'e', 's' + String result = new String(validChars); + assertTrue(result.contains("t")); + assertTrue(result.contains("e")); + assertTrue(result.contains("s")); + } + + public void testGetValidCharactersOrConstraint() { + StringConstraint constraint1 = new EqualsStringConstraint("test"); + StringConstraint constraint2 = new StringSetConstraint(Set.of("One", "Two")); + StringConstraint combined = constraint1.or(constraint2); + + char[] validChars = combined.getValidCharacters(); + + assertNotNull(validChars); + assertEquals(8, validChars.length); // Union: 't', 'e', 's', 'O', 'n', 'T', 'w', 'o' + String result = new String(validChars); + assertTrue(result.contains("t")); + assertTrue(result.contains("e")); + assertTrue(result.contains("s")); + assertTrue(result.contains("O")); + 
assertTrue(result.contains("n")); + assertTrue(result.contains("T")); + assertTrue(result.contains("w")); + assertTrue(result.contains("o")); + } +}
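The leftmost-bit expectations pinned down by BitmaskRegistryTests.testGetLeftmostBitIndex (1 -> 0, 3 -> 1, 7 -> 2, 8 -> 3) amount to "index of the highest set bit". A standalone sketch of that mapping using only the JDK, not the PR's BitmaskRegistry implementation:

// Standalone sketch, not the PR's BitmaskRegistry: the mapping asserted in
// BitmaskRegistryTests.testGetLeftmostBitIndex is "index of the highest set bit",
// which the JDK exposes via Integer.numberOfLeadingZeros.
public final class LeftmostBitSketch {

    // Zero-based index of the highest set bit; yields -1 for an empty bitmask.
    static int leftmostBitIndex(int bitmask) {
        return 31 - Integer.numberOfLeadingZeros(bitmask);
    }

    public static void main(String[] args) {
        // Mirrors the test's expectations: 1 -> 0, 2 -> 1, 3 -> 1, 4..7 -> 2, 8 -> 3
        for (int bitmask : new int[] { 1, 2, 3, 4, 5, 6, 7, 8 }) {
            System.out.println(bitmask + " -> " + leftmostBitIndex(bitmask));
        }
    }
}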
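CharParserTests.testFindBitmaskForInteger fixes a threshold lookup: with upper bounds { 10, 20, 30, Integer.MAX_VALUE } and bitmasks { 1, 2, 3, 0 }, a value maps to the bitmask of the first range whose inclusive upper bound it does not exceed. A standalone sketch of that lookup as a linear scan; the PR's CharParser may search the ranges differently:

// Standalone sketch of the threshold lookup asserted in
// CharParserTests.testFindBitmaskForInteger: return the bitmask of the first
// range whose inclusive upper bound is >= the value.
public final class ThresholdLookupSketch {

    static int bitmaskFor(int value, int[] upperBounds, int[] bitmasks) {
        for (int i = 0; i < upperBounds.length; i++) {
            if (value <= upperBounds[i]) {
                return bitmasks[i];
            }
        }
        return 0; // unreachable when the last bound is Integer.MAX_VALUE
    }

    public static void main(String[] args) {
        int[] bounds = { 10, 20, 30, Integer.MAX_VALUE };
        int[] masks = { 1, 2, 3, 0 };
        System.out.println(bitmaskFor(9, bounds, masks));  // 1
        System.out.println(bitmaskFor(10, bounds, masks)); // 1
        System.out.println(bitmaskFor(11, bounds, masks)); // 2
        System.out.println(bitmaskFor(31, bounds, masks)); // 0
    }
}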
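Both regex-based reference parsers guard against double extraction with isOverlappingWithExistingArguments, which compares start and end positions with inclusive bounds. A standalone sketch (the Range record here is hypothetical; the PR works with Argument.startPosition() and length()) showing that, with end = start + length, the check also reports ranges that merely touch as overlapping:

// Standalone sketch of the overlap test used by the regex reference parsers above.
// Range is a hypothetical stand-in for an Argument's start/length pair.
public final class OverlapSketch {

    record Range(int start, int length) {
        int end() {
            return start + length; // exclusive end, as in startPosition() + length()
        }
    }

    // Same shape as isOverlappingWithExistingArguments: inclusive comparison on both
    // ends, so ranges that only touch (a.end() == b.start()) also count as overlapping.
    static boolean overlaps(Range a, Range b) {
        return a.start() <= b.end() && a.end() >= b.start();
    }

    public static void main(String[] args) {
        Range timestamp = new Range(0, 10);
        Range integer = new Range(10, 4); // starts exactly where the timestamp ends
        Range ip = new Range(20, 9);
        System.out.println(overlaps(timestamp, integer)); // true: touching counts as overlap here
        System.out.println(overlaps(timestamp, ip));      // false: disjoint
    }
}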
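StringToIntMapper_Experimental keeps lookups cheap by growing a sparse table until every key either sits alone in its slot or shares it with at most one other key held in a second parallel array, so a lookup costs one hash plus at most two equality checks. A much-simplified standalone sketch of that layout over plain String keys; the real class uses SubstringView to avoid allocation and retries with progressively larger tables up to MAX_SIZE, whereas this sketch simply fails on a third collision:

// Standalone sketch of the "at most one collision per slot" layout used by
// StringToIntMapper_Experimental, simplified to plain String keys.
public final class TwoSlotMapSketch {

    private final String[] slot1;
    private final String[] slot2;
    private final int[] val1;
    private final int[] val2;

    TwoSlotMapSketch(String[] keys, int[] values, int tableSize) {
        slot1 = new String[tableSize];
        slot2 = new String[tableSize];
        val1 = new int[tableSize];
        val2 = new int[tableSize];
        for (int i = 0; i < keys.length; i++) {
            int h = index(keys[i], tableSize);
            if (slot1[h] == null) {
                slot1[h] = keys[i];
                val1[h] = values[i];
            } else if (slot2[h] == null) {
                slot2[h] = keys[i];
                val2[h] = values[i];
            } else {
                // The real class would retry with a larger table; the sketch just fails.
                throw new IllegalArgumentException("more than two keys map to slot " + h);
            }
        }
    }

    // Non-negative hash, then modulo the table size (same idea as internalHash above).
    private static int index(String s, int tableSize) {
        return (s.hashCode() & 0x7FFFFFFF) % tableSize;
    }

    int get(String key) {
        int h = index(key, slot1.length);
        if (key.equals(slot1[h])) return val1[h];
        if (key.equals(slot2[h])) return val2[h];
        return -1; // same "no match" sentinel as the experimental mapper
    }

    public static void main(String[] args) {
        TwoSlotMapSketch months = new TwoSlotMapSketch(new String[] { "Jan", "Feb", "Mar" }, new int[] { 1, 2, 3 }, 11);
        System.out.println(months.get("Feb")); // 2
        System.out.println(months.get("Foo")); // -1
    }
}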
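PatternUtilsParseMultiTokenFormatTests expect a format such as "$Mon, $DD $YYYY" to be split into token names plus the delimiter runs between them. A standalone sketch of that split, assuming every token starts with '$' and its name is alphanumeric; the PR's PatternUtils additionally validates delimiters against token boundary characters and rejects literal text between tokens:

import java.util.ArrayList;
import java.util.List;

// Standalone sketch of splitting a "$token<delims>$token..." format string into
// token names and the delimiter runs between them.
public final class MultiTokenFormatSketch {

    static void split(String format, List<String> tokens, List<String> delimiters) {
        int i = 0;
        while (i < format.length()) {
            if (format.charAt(i) == '$') {
                int start = ++i; // skip '$', then read the alphanumeric token name
                while (i < format.length() && Character.isLetterOrDigit(format.charAt(i))) {
                    i++;
                }
                tokens.add(format.substring(start, i));
            } else {
                int start = i; // collect everything up to the next token as a delimiter run
                while (i < format.length() && format.charAt(i) != '$') {
                    i++;
                }
                delimiters.add(format.substring(start, i));
            }
        }
    }

    public static void main(String[] args) {
        List<String> tokens = new ArrayList<>();
        List<String> delimiters = new ArrayList<>();
        split("$Mon, $DD $YYYY", tokens, delimiters);
        System.out.println(tokens); // [Mon, DD, YYYY]
        delimiters.forEach(d -> System.out.println("delimiter: \"" + d + "\"")); // ", " and " "
    }
}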
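IntConstraintsTests verify and()/or() through the resulting trueRanges(): 1-10 AND 5-15 collapses to 5-10, and non-overlapping inputs collapse to nothing. A standalone sketch (the Range record is hypothetical, not the PR's IntConstraints.Range) of how ANDing two lists of inclusive ranges reduces to pairwise intersection:

import java.util.ArrayList;
import java.util.List;

// Standalone sketch: intersection of two lists of inclusive integer ranges,
// mirroring the behaviour asserted in IntConstraintsTests.testAndWithOverlappingRanges.
public final class RangeAndSketch {

    record Range(int min, int max) {}

    static List<Range> intersect(List<Range> a, List<Range> b) {
        List<Range> out = new ArrayList<>();
        for (Range x : a) {
            for (Range y : b) {
                int lo = Math.max(x.min(), y.min());
                int hi = Math.min(x.max(), y.max());
                if (lo <= hi) {
                    out.add(new Range(lo, hi)); // keep only the overlapping part
                }
            }
        }
        return out;
    }

    public static void main(String[] args) {
        System.out.println(intersect(List.of(new Range(1, 10)), List.of(new Range(5, 15)))); // [Range[min=5, max=10]]
        // Non-overlapping inputs produce an empty list, as in testAndWithNonOverlappingRanges.
        System.out.println(intersect(List.of(new Range(1, 5)), List.of(new Range(10, 15)))); // []
    }
}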