Skip to content

Commit 0a018c5

Browse files
committed
First phase handling trimmed characters
1 parent 6f21cd5 commit 0a018c5

File tree

11 files changed

+478
-110
lines changed

11 files changed

+478
-110
lines changed

benchmarks/src/main/java/org/elasticsearch/benchmark/index/mapper/PatternedTextParserBenchmark.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@
99

1010
package org.elasticsearch.benchmark.index.mapper;
1111

12-
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Parser;
13-
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.ParseException;
14-
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.ParserFactory;
1512
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Argument;
1613
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.IPv4Argument;
1714
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.IntegerArgument;
15+
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.ParseException;
16+
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Parser;
17+
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.ParserFactory;
1818
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.PatternedMessage;
1919
import org.elasticsearch.xpack.logsdb.patternedtext.charparser.api.Timestamp;
2020
import org.openjdk.jmh.annotations.Benchmark;

x-pack/plugin/logsdb/src/main/java/org/elasticsearch/xpack/logsdb/patternedtext/charparser/compiler/SchemaCompiler.java

Lines changed: 30 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@
3131
import java.util.Comparator;
3232
import java.util.HashMap;
3333
import java.util.HashSet;
34+
import java.util.List;
3435
import java.util.Map;
3536
import java.util.Set;
36-
import java.util.StringJoiner;
3737
import java.util.function.Supplier;
3838

3939
import static org.elasticsearch.xpack.logsdb.patternedtext.charparser.common.CharCodes.ALPHABETIC_CHAR_CODE;
@@ -84,31 +84,31 @@ public static CompiledSchema compile(Schema schema) {
8484
BitmaskRegistry<MultiTokenType> multiTokenBitmaskRegistry = new BitmaskRegistry<>();
8585
for (org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.MultiTokenType multiTokenType : schema.getMultiTokenTypes()) {
8686
MultiTokenFormat format = multiTokenType.getFormat();
87-
org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType[] tokens = format.getTokens();
87+
List<org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType> tokens = format.getTokens();
8888

8989
TimestampFormat timestampFormat = null;
9090
if (multiTokenType.encodingType() == EncodingType.TIMESTAMP) {
91-
timestampFormat = createTimestampFormat(tokens);
91+
timestampFormat = createTimestampFormat(format);
9292
}
9393

9494
int subTokenCount = multiTokenType.getNumberOfSubTokens();
9595
int multiTokenBitmask = multiTokenBitmaskRegistry.register(
9696
new MultiTokenType(multiTokenType.name(), multiTokenType.encodingType(), subTokenCount, timestampFormat)
9797
);
9898

99-
maxTokensPerMultiToken = Math.max(maxTokensPerMultiToken, tokens.length);
99+
maxTokensPerMultiToken = Math.max(maxTokensPerMultiToken, tokens.size());
100100
maxSubTokensPerMultiToken = Math.max(maxSubTokensPerMultiToken, subTokenCount);
101101

102-
int bitmaskForTokenCount = tokenCountToMultiTokenBitmaskMap.computeIfAbsent(tokens.length, input -> 0);
102+
int bitmaskForTokenCount = tokenCountToMultiTokenBitmaskMap.computeIfAbsent(tokens.size(), input -> 0);
103103
bitmaskForTokenCount |= multiTokenBitmask;
104-
tokenCountToMultiTokenBitmaskMap.put(tokens.length, bitmaskForTokenCount);
104+
tokenCountToMultiTokenBitmaskMap.put(tokens.size(), bitmaskForTokenCount);
105105

106106
int bitmaskForSubTokenCount = subTokenCountToMultiTokenBitmaskMap.computeIfAbsent(subTokenCount, input -> 0);
107107
bitmaskForSubTokenCount |= multiTokenBitmask;
108108
subTokenCountToMultiTokenBitmaskMap.put(subTokenCount, bitmaskForSubTokenCount);
109109

110-
for (int i = 0; i < tokens.length; i++) {
111-
String tokenName = tokens[i].name();
110+
for (int i = 0; i < tokens.size(); i++) {
111+
String tokenName = tokens.get(i).name();
112112
ArrayList<Integer> bitmaskList = tokenTypeToMultiTokenBitmaskByPosition.computeIfAbsent(
113113
tokenName,
114114
input -> new ArrayList<>()
@@ -635,48 +635,56 @@ private record RangeBoundary(int boundary, boolean isLowerBound, int bitmask) {}
635635
// =================================================== Timestamp formatting ========================================================
636636

637637
/**
638-
* Creates a {@link TimestampFormat} from an array of {@link org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType}
639-
* objects that represent timestamp components.
640-
* @param formatTokens An array of TokenType objects representing the timestamp format.
638+
* Creates a {@link TimestampFormat} from a {@link MultiTokenFormat} object that represents timestamp components.
639+
* This method processes both token parts and literal string parts to construct the final Java time format.
640+
* @param format The MultiTokenFormat object representing the timestamp format.
641641
* @return A TimestampFormat object containing the format string and an array indicating the order of timestamp components.
642642
*/
643-
static TimestampFormat createTimestampFormat(org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType[] formatTokens) {
644-
StringJoiner javaTimeFormat = new StringJoiner(" ");
643+
static TimestampFormat createTimestampFormat(MultiTokenFormat format) {
644+
StringBuilder javaTimeFormat = new StringBuilder();
645645
int[] timestampComponentsOrder = new int[TimestampComponentType.values().length];
646646
Arrays.fill(timestampComponentsOrder, -1);
647647
int nextComponentIndex = 0;
648-
for (org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType token : formatTokens) {
649-
nextComponentIndex += appendTimestampComponents(token, javaTimeFormat, timestampComponentsOrder, nextComponentIndex);
648+
for (Object part : format.getFormatParts()) {
649+
if (part instanceof String) {
650+
for (char c : ((String) part).toCharArray()) {
651+
appendDelimiter(javaTimeFormat, c);
652+
}
653+
} else if (part instanceof org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType token) {
654+
StringBuilder tokenJavaTimeFormat = new StringBuilder();
655+
nextComponentIndex += appendTimestampComponents(token, tokenJavaTimeFormat, timestampComponentsOrder, nextComponentIndex);
656+
javaTimeFormat.append(tokenJavaTimeFormat);
657+
}
650658
}
651659
return new TimestampFormat(javaTimeFormat.toString(), timestampComponentsOrder);
652660
}
653661

654662
/**
655663
* Creates a {@link TimestampFormat} from a single {@link org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType}
656664
* object that represents a timestamp format.
657-
* @param formatToken A TokenType object representing the timestamp format.
665+
* @param timestampToken A TokenType object representing the timestamp format.
658666
* @return A TimestampFormat object containing the format string and an array indicating the order of timestamp components.
659667
*/
660-
static TimestampFormat createTimestampFormat(org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType formatToken) {
661-
StringJoiner javaTimeFormat = new StringJoiner("");
668+
static TimestampFormat createTimestampFormat(org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType timestampToken) {
669+
StringBuilder javaTimeFormat = new StringBuilder();
662670
int[] timestampComponentsOrder = new int[TimestampComponentType.values().length];
663671
Arrays.fill(timestampComponentsOrder, -1);
664-
appendTimestampComponents(formatToken, javaTimeFormat, timestampComponentsOrder, 0);
672+
appendTimestampComponents(timestampToken, javaTimeFormat, timestampComponentsOrder, 0);
665673
return new TimestampFormat(javaTimeFormat.toString(), timestampComponentsOrder);
666674
}
667675

668676
/**
669677
* Appends the details of a given token to the provided javaTimeFormat and updates the timestampComponentsOrder array to reflect the
670678
* order of timestamp components.
671679
* @param token the TokenType object representing the timestamp format
672-
* @param javaTimeFormat the StringJoiner to append the Java time format string
680+
* @param javaTimeFormat the StringBuilder to append the Java time format string
673681
* @param timestampComponentsOrder an array to store the order of timestamp components
674682
* @param nextComponentIndex the next index to use in the timestampComponentsOrder array
675683
* @return the number of timestamp components appended to the javaTimeFormat
676684
*/
677685
private static int appendTimestampComponents(
678686
org.elasticsearch.xpack.logsdb.patternedtext.charparser.schema.TokenType token,
679-
StringJoiner javaTimeFormat,
687+
StringBuilder javaTimeFormat,
680688
int[] timestampComponentsOrder,
681689
int nextComponentIndex
682690
) {
@@ -698,7 +706,7 @@ private static int appendTimestampComponents(
698706
appendedComponents++;
699707
}
700708
}
701-
javaTimeFormat.add(tokenJavaTimeFormat.toString());
709+
javaTimeFormat.append(tokenJavaTimeFormat);
702710
return appendedComponents;
703711
}
704712

0 commit comments

Comments
 (0)