77
88package org .elasticsearch .xpack .logsdb .patternedtext .charparser .parser ;
99
10+ import org .elasticsearch .xpack .logsdb .patternedtext .charparser .api .DataLossParseException ;
11+ import org .elasticsearch .xpack .logsdb .patternedtext .charparser .api .ParseException ;
1012import org .elasticsearch .xpack .logsdb .patternedtext .charparser .api .Parser ;
1113import org .elasticsearch .xpack .logsdb .patternedtext .charparser .common .EncodingType ;
1214import org .elasticsearch .xpack .logsdb .patternedtext .charparser .compiler .CompiledSchema ;
@@ -78,6 +80,8 @@ public final class CharParser implements Parser {
7880 private final StringBuilder patternedMessage = new StringBuilder ();
7981 private final List <Argument <?>> arguments = new ArrayList <>();
8082 private Timestamp timestamp = null ;
83+ // net count of raw-message characters replaced by argument placeholders; used at the end of parse() to verify that no data was lost
84+ int totalCharsAttributedToArgs ;
8185
8286 // current subToken state
8387 private int currentSubTokenStartIndex ;
@@ -97,10 +101,13 @@ public final class CharParser implements Parser {
97101 private final char [] bufferedSubTokenDelimiters ;
98102
99103 // current multi-token state
104+ private int currentMultiTokenStartIndex ;
100105 int currentMultiTokenBitmask ;
101106 int currentTokenIndex ;
102107 final TokenType [] bufferedTokens ;
103108 final int [] bufferedTokenBitmasks ;
109+ final int [] bufferedTokenStartIndexes ;
110+ final int [] bufferedTokenLengths ;
104111 private final char [] bufferedTokenDelimiters ;
105112 private final int [] currentMultiTokenSubTokenValues ;
106113 private int currentMultiTokenSubTokenIndex ;
@@ -126,6 +133,8 @@ public CharParser(CompiledSchema compiledSchema) {
126133 bufferedSubTokenLengths = new int [compiledSchema .maxSubTokensPerToken + 1 ];
127134 bufferedTokens = new TokenType [compiledSchema .maxTokensPerMultiToken + 1 ];
128135 bufferedTokenBitmasks = new int [compiledSchema .maxTokensPerMultiToken + 1 ];
136+ bufferedTokenStartIndexes = new int [compiledSchema .maxTokensPerMultiToken + 1 ];
137+ bufferedTokenLengths = new int [compiledSchema .maxTokensPerMultiToken + 1 ];
129138 bufferedTokenDelimiters = new char [compiledSchema .maxTokensPerMultiToken + 1 ];
130139 }
131140
@@ -145,6 +154,7 @@ private void resetTokenState() {
145154 }
146155
147156 private void resetMultiTokenState () {
157+ currentMultiTokenStartIndex = -1 ;
148158 currentTokenIndex = -1 ;
149159 currentMultiTokenBitmask = allMultiTokenBitmask ;
150160 currentMultiTokenSubTokenIndex = 0 ;
@@ -155,6 +165,7 @@ private void reset() {
155165 patternedMessage .setLength (0 );
156166 arguments .clear ();
157167 timestamp = null ;
168+ totalCharsAttributedToArgs = 0 ;
158169 resetSubTokenState ();
159170 resetTokenState ();
160171 resetMultiTokenState ();
@@ -215,7 +226,7 @@ private void reset() {
215226 * @param rawMessage the input message to parse
216227 * @return a {@link PatternedMessage} containing the pattern template, timestamp, and typed arguments
217228 */
218- public PatternedMessage parse (String rawMessage ) {
229+ public PatternedMessage parse (String rawMessage ) throws ParseException {
219230 if (rawMessage == null || rawMessage .isEmpty ()) {
220231 return new PatternedMessage ("" , null , new Argument <?>[0 ]);
221232 }
@@ -344,6 +355,13 @@ public PatternedMessage parse(String rawMessage) {
344355
345356 // handle token finalization
346357 int formerMultiTokenBitmask = currentMultiTokenBitmask ;
358+ int formerMultiTokenEndIndex ;
359+ if (currentTokenIndex >= 0 ) {
360+ formerMultiTokenEndIndex = bufferedTokenStartIndexes [currentTokenIndex ] + bufferedTokenLengths [currentTokenIndex ];
361+ } else {
362+ formerMultiTokenEndIndex = indexWithinRawMessage ;
363+ }
364+
347365 if (charType == TOKEN_DELIMITER_CHAR_CODE || charType == LINE_END_CODE ) {
348366 int currentTokenLength = indexWithinRawMessage - currentTokenStartIndex ;
349367 if (currentTokenLength == 0 ) {
@@ -361,8 +379,12 @@ public PatternedMessage parse(String rawMessage) {
361379 bufferedTokens [currentTokenIndex ] = currentToken ;
362380 bufferedTokenDelimiters [currentTokenIndex ] = currentChar ;
363381 bufferedTokenBitmasks [currentTokenIndex ] = currentTokenBitmask ;
382+ bufferedTokenStartIndexes [currentTokenIndex ] = currentTokenStartIndex ;
383+ bufferedTokenLengths [currentTokenIndex ] = currentTokenLength ;
364384
365- if (currentTokenIndex == compiledSchema .maxTokensPerMultiToken ) {
385+ if (currentTokenIndex == 0 ) {
386+ currentMultiTokenStartIndex = currentTokenStartIndex ;
387+ } else if (currentTokenIndex == compiledSchema .maxTokensPerMultiToken ) {
366388 // we already passed the maximum number of tokens for any known multi-token
367389 currentMultiTokenBitmask = 0 ;
368390 } else {
@@ -436,11 +458,17 @@ public PatternedMessage parse(String rawMessage) {
436458 if (multiTokenType .encodingType () == EncodingType .TIMESTAMP ) {
437459 createAndStoreTimestamp (multiTokenType );
438460 } else {
439- throw new IllegalStateException ("Unknown multi-token type: " + multiTokenType .name ());
461+ throw new ParseException ("Unknown multi-token type: " + multiTokenType .name ());
440462 }
463+
464+ int multiTokenLength = formerMultiTokenEndIndex - currentMultiTokenStartIndex ;
465+ totalCharsAttributedToArgs += multiTokenLength - 2 ; // deducting 2 for the %T placeholder
466+
441467 // now fixing the buffers so that the last token becomes the only buffered token
442468 bufferedTokens [0 ] = bufferedTokens [currentTokenIndex ];
443469 bufferedTokenBitmasks [0 ] = bufferedTokenBitmasks [currentTokenIndex ];
470+ bufferedTokenStartIndexes [0 ] = bufferedTokenStartIndexes [currentTokenIndex ];
471+ bufferedTokenLengths [0 ] = bufferedTokenLengths [currentTokenIndex ];
444472 bufferedTokenDelimiters [0 ] = bufferedTokenDelimiters [currentTokenIndex ];
445473 currentTokenIndex = 0 ;
446474 currentMultiTokenSubTokenIndex = 0 ;
@@ -461,16 +489,26 @@ public PatternedMessage parse(String rawMessage) {
461489 case INTEGER -> new IntegerArgument (bufferedSubTokenIntValues [0 ]);
462490 case HEX -> new HexadecimalArgument (
463491 rawMessage ,
464- bufferedSubTokenStartIndexes [ 0 ],
465- bufferedSubTokenLengths [ 0 ]
492+ bufferedTokenStartIndexes [ i ],
493+ bufferedTokenLengths [ i ]
466494 );
467- case IPV4 -> new IPv4Argument (bufferedSubTokenIntValues );
468- case UUID -> new UUIDArgument (rawMessage , currentTokenStartIndex , indexWithinRawMessage );
495+ case IPV4 -> {
496+ if (currentTokenIndex == 0 ) {
497+ // IPv4 tokens can only be part of a single token, so we can safely create an IPv4 argument
498+ yield new IPv4Argument (bufferedSubTokenIntValues );
499+ } else {
500+ throw new ParseException (
501+ "IPV4 token cannot be part of a multi-token, but found at position " + i
502+ );
503+ }
504+ }
505+ case UUID -> new UUIDArgument (rawMessage , bufferedTokenStartIndexes [i ], bufferedTokenLengths [i ]);
469506 default -> null ;
470507 };
471508 if (argument != null ) {
472509 arguments .add (argument );
473510 patternedMessage .append (ARGUMENT_PLACEHOLDER_PREFIX ).append (argument .type ().getSymbol ());
511+ totalCharsAttributedToArgs += bufferedTokenLengths [i ] - 2 ; // deducting 2 for the % prefix and symbol
474512 } else {
475513 // todo
476514 }
@@ -511,6 +549,7 @@ public PatternedMessage parse(String rawMessage) {
511549 if (argument != null ) {
512550 arguments .add (argument );
513551 patternedMessage .append (ARGUMENT_PLACEHOLDER_PREFIX ).append (argument .type ().getSymbol ());
552+ totalCharsAttributedToArgs += bufferedSubTokenLengths [i ] - 2 ; // deducting 2 for the % prefix and symbol
514553 } else {
515554 patternedMessage .append (rawMessage , bufferedSubTokenStartIndexes [i ], indexWithinRawMessage );
516555 }
@@ -534,6 +573,11 @@ public PatternedMessage parse(String rawMessage) {
534573 default :
535574 }
536575 }
576+
577+ int consumedCharsForPatternedMessage = patternedMessage .length () + totalCharsAttributedToArgs ;
578+ if (consumedCharsForPatternedMessage != rawMessage .length ()) {
579+ throw new DataLossParseException ("Data loss detected during parsing" , rawMessage .length (), consumedCharsForPatternedMessage );
580+ }
537581 return new PatternedMessage (patternedMessage .toString (), timestamp , arguments .toArray (new Argument <?>[0 ]));
538582 }
539583
0 commit comments