Skip to content

Commit e251834

Browse files
authored
Fix numbered token bug and make it optional output in patterns command (#4402)
* Fix patterns command incorrectly parsing token and add parameter option Signed-off-by: Songkan Tang <[email protected]> * Fix some cases after merge Signed-off-by: Songkan Tang <[email protected]> * Fix failed tests Signed-off-by: Songkan Tang <[email protected]> * Fix UTs and modify docs Signed-off-by: Songkan Tang <[email protected]> * Add Brain parameter explanation to the doc Signed-off-by: Songkan Tang <[email protected]> * Modify patterns doctest after calcite default engine enabled Signed-off-by: Songkan Tang <[email protected]> * Rephrase some words in patterns.rst Signed-off-by: Songkan Tang <[email protected]> --------- Signed-off-by: Songkan Tang <[email protected]>
1 parent e2df87b commit e251834

File tree

39 files changed

+1023
-417
lines changed

39 files changed

+1023
-417
lines changed

common/src/main/java/org/opensearch/sql/common/patterns/BrainLogParser.java

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -172,8 +172,11 @@ public BrainLogParser(
172172
* @return list of tokens by splitting preprocessed log message
173173
*/
174174
public List<String> preprocess(String logMessage, String logId) {
175-
if (logMessage == null || logId == null) {
176-
throw new IllegalArgumentException("log message or logId must not be null");
175+
if (logId == null) {
176+
throw new IllegalArgumentException("logId must not be null");
177+
}
178+
if (logMessage == null) {
179+
return Arrays.asList("", logId);
177180
}
178181

179182
List<String> tokens = preprocess(logMessage, this.filterPatternVariableMap, this.delimiters);
@@ -224,7 +227,7 @@ public void processTokenHistogram(List<String> tokens) {
224227
* @return list of token lists
225228
*/
226229
public List<List<String>> preprocessAllLogs(List<String> logMessages) {
227-
List<List<String>> preprocessedLogs = new ArrayList<>();
230+
List<List<String>> preprocessedLogs = new ArrayList<>(logMessages.size());
228231

229232
for (int i = 0; i < logMessages.size(); i++) {
230233
String logId = String.valueOf(i);
@@ -291,7 +294,8 @@ public static List<String> parseLogPattern(
291294
String groupCandidateStr = logIdGroupCandidateMap.get(logId);
292295
String[] groupCandidate = groupCandidateStr.split(",");
293296
Long repFreq = Long.parseLong(groupCandidate[0]); // representative frequency of the group
294-
return IntStream.range(0, tokens.size() - 1)
297+
int tokenCapacity = Math.max(0, tokens.size() - 1);
298+
return IntStream.range(0, tokenCapacity)
295299
.mapToObj(i -> new AbstractMap.SimpleEntry<>(i, tokens.get(i)))
296300
.map(
297301
entry -> {
@@ -334,7 +338,7 @@ public static List<String> parseLogPattern(
334338
}
335339
return token;
336340
})
337-
.collect(Collectors.toList());
341+
.collect(Collectors.toCollection(() -> new ArrayList<>(tokenCapacity)));
338342
}
339343

340344
/**
@@ -349,7 +353,10 @@ public Map<String, Map<String, Object>> parseAllLogPatterns(
349353

350354
Map<String, Map<String, Object>> logPatternMap = new HashMap<>();
351355
for (int i = 0; i < processedMessages.size(); i++) {
352-
List<String> logPattern = this.parseLogPattern(processedMessages.get(i));
356+
List<String> logPattern =
357+
this.parseLogPattern(processedMessages.get(i)).stream()
358+
.map(BrainLogParser::collapseContinuousWildcards)
359+
.collect(Collectors.toList());
353360
String patternKey = String.join(" ", logPattern);
354361
String sampleLog = logMessages.get(i);
355362
logPatternMap.compute(
@@ -379,6 +386,29 @@ public Map<String, Map<String, Object>> parseAllLogPatterns(
379386
return logPatternMap;
380387
}
381388

389+
static String collapseContinuousWildcards(String part) {
390+
// The minimum of continuous wildcards are 6 characters: <*><*>
391+
if (part == null || part.length() < 6) {
392+
return part;
393+
}
394+
int tokenLen = VARIABLE_DENOTER.length();
395+
StringBuilder sb = new StringBuilder(part.length());
396+
int i = 0;
397+
while (i < part.length()) {
398+
int j = part.indexOf(VARIABLE_DENOTER, i);
399+
if (j < 0) {
400+
sb.append(part, i, part.length());
401+
break;
402+
}
403+
sb.append(part, i, j).append(VARIABLE_DENOTER);
404+
do {
405+
j += tokenLen;
406+
} while (j <= part.length() - tokenLen && part.startsWith(VARIABLE_DENOTER, j));
407+
i = j;
408+
}
409+
return sb.toString();
410+
}
411+
382412
private Map<Long, Integer> getWordOccurrences(List<String> tokens) {
383413
Map<Long, Integer> occurrences = new HashMap<>();
384414
for (int i = 0; i < tokens.size() - 1; i++) {

common/src/main/java/org/opensearch/sql/common/patterns/PatternUtils.java

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
package org.opensearch.sql.common.patterns;
77

8+
import com.google.common.collect.ImmutableList;
89
import java.util.ArrayList;
910
import java.util.List;
1011
import java.util.Map;
@@ -20,6 +21,8 @@ public final class PatternUtils {
2021
public static final Pattern WILDCARD_PATTERN = Pattern.compile("<\\*[^>]*>");
2122
public static final String TOKEN_PREFIX = "<token";
2223
public static final Pattern TOKEN_PATTERN = Pattern.compile("<token\\d+>");
24+
public static final List<String> VALID_BRAIN_PARAMETERS =
25+
ImmutableList.of("variable_count_threshold", "frequency_threshold_percentage");
2326

2427
public static Map<String, Map<String, Object>> mergePatternGroups(
2528
Map<String, Map<String, Object>> left,
@@ -55,6 +58,7 @@ public static Map<String, Map<String, Object>> mergePatternGroups(
5558
public static void extractVariables(
5659
ParseResult parseResult, String original, Map<String, List<String>> result, String prefix) {
5760
List<String> parts = parseResult.parts;
61+
List<Boolean> isToken = parseResult.isToken;
5862
List<String> tokenOrder = parseResult.tokenOrder;
5963

6064
if (parts.isEmpty()) {
@@ -67,7 +71,7 @@ public static void extractVariables(
6771

6872
while (i < parts.size()) {
6973
String currentPart = parts.get(i);
70-
if (currentPart.startsWith(prefix)) { // Process already labeled part
74+
if (isToken.get(i)) { // Process already labeled part
7175
String tokenKey = tokenOrder.get(tokenIndex++);
7276
if (i == parts.size() - 1) { // The last part
7377
String value = original.substring(pos);
@@ -97,19 +101,22 @@ public static void extractVariables(
97101
}
98102

99103
public static class ParseResult {
100-
List<String> parts;
101-
List<String> tokenOrder;
104+
final List<String> parts;
105+
final List<Boolean> isToken;
106+
final List<String> tokenOrder;
102107

103-
public ParseResult(List<String> parts, List<String> tokenOrder) {
108+
public ParseResult(List<String> parts, List<Boolean> isToken, List<String> tokenOrder) {
104109
this.parts = parts;
110+
this.isToken = isToken;
105111
this.tokenOrder = tokenOrder;
106112
}
107113

108114
public String toTokenOrderString(String prefix) {
109115
StringBuilder result = new StringBuilder();
110116
int tokenIndex = 0;
111-
for (String currentPart : parts) {
112-
if (currentPart.startsWith(prefix)) {
117+
for (int i = 0; i < parts.size(); i++) {
118+
String currentPart = parts.get(i);
119+
if (isToken.get(i)) {
113120
result.append(tokenOrder.get(tokenIndex++));
114121
} else {
115122
result.append(currentPart);
@@ -126,6 +133,7 @@ public String toTokenOrderString(String prefix) {
126133
*/
127134
public static ParseResult parsePattern(String pattern, Pattern compiledPattern) {
128135
List<String> parts = new ArrayList<>();
136+
List<Boolean> isToken = new ArrayList<>();
129137
List<String> tokenOrder = new ArrayList<>();
130138
Matcher matcher = compiledPattern.matcher(pattern);
131139
int lastEnd = 0;
@@ -137,20 +145,23 @@ public static ParseResult parsePattern(String pattern, Pattern compiledPattern)
137145
// Add static part before the found match if there is
138146
if (start > lastEnd) {
139147
parts.add(pattern.substring(lastEnd, start));
148+
isToken.add(false);
140149
}
141150
// Add matched wildcard part and generate token order key
142151
String wildcard = matcher.group();
143152
parts.add(wildcard);
153+
isToken.add(true);
144154
tokenOrder.add("<token" + tokenCount++ + ">");
145155
lastEnd = end;
146156
}
147157

148158
// Add static part at the end
149159
if (lastEnd < pattern.length()) {
150160
parts.add(pattern.substring(lastEnd));
161+
isToken.add(false);
151162
}
152163

153-
return new ParseResult(parts, tokenOrder);
164+
return new ParseResult(parts, isToken, tokenOrder);
154165
}
155166

156167
private static void addToResult(Map<String, List<String>> result, String key, String value) {

common/src/main/java/org/opensearch/sql/common/setting/Settings.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ public enum Key {
2929
PATTERN_MODE("plugins.ppl.pattern.mode"),
3030
PATTERN_MAX_SAMPLE_COUNT("plugins.ppl.pattern.max.sample.count"),
3131
PATTERN_BUFFER_LIMIT("plugins.ppl.pattern.buffer.limit"),
32+
PATTERN_SHOW_NUMBERED_TOKEN("plugins.ppl.pattern.show.numbered.token"),
3233
PPL_REX_MAX_MATCH_LIMIT("plugins.ppl.rex.max_match.limit"),
3334
PPL_VALUES_MAX_LIMIT("plugins.ppl.values.max.limit"),
3435
PPL_SYNTAX_LEGACY_PREFERRED("plugins.ppl.syntax.legacy.preferred"),

common/src/test/java/org/opensearch/sql/common/patterns/BrainLogParserTest.java

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import static org.junit.Assert.assertThrows;
1111
import static org.junit.Assert.assertTrue;
1212

13+
import com.google.common.collect.ImmutableList;
1314
import com.google.common.collect.ImmutableMap;
1415
import java.util.AbstractMap;
1516
import java.util.Arrays;
@@ -104,6 +105,15 @@ public void testPreprocess() {
104105
assertEquals(expectedResult, result);
105106
}
106107

108+
@Test
109+
public void testPreprocessNullString() {
110+
String logMessage = null;
111+
String logId = "log1";
112+
List<String> expectedResult = Arrays.asList("", "log1");
113+
List<String> result = parser.preprocess(logMessage, logId);
114+
assertEquals(expectedResult, result);
115+
}
116+
107117
@Test
108118
public void testPreprocessWithUUID() {
109119
String logMessage = "127.0.0.1 - 1234 something, user_id:c78ac970-f0c3-4954-8cf8-352a8458d01c";
@@ -124,15 +134,11 @@ public void testPreprocessWithUUID() {
124134
public void testPreprocessWithIllegalInput() {
125135
String logMessage = "127.0.0.1 - 1234 something";
126136
String logId = "log1";
127-
String exceptionMessage = "log message or logId must not be null";
137+
String exceptionMessage = "logId must not be null";
138+
assertEquals(ImmutableList.of("", logId), parser.preprocess(null, logId));
128139
Throwable throwable =
129-
assertThrows(IllegalArgumentException.class, () -> parser.preprocess(null, logId));
130-
assertEquals(exceptionMessage, throwable.getMessage());
131-
throwable =
132140
assertThrows(IllegalArgumentException.class, () -> parser.preprocess(logMessage, null));
133141
assertEquals(exceptionMessage, throwable.getMessage());
134-
throwable = assertThrows(IllegalArgumentException.class, () -> parser.preprocess(null, null));
135-
assertEquals(exceptionMessage, throwable.getMessage());
136142
}
137143

138144
@Test
@@ -209,6 +215,29 @@ public void testParseLogPattern() {
209215
assertEquals(expectedLogPattern, logPattern);
210216
}
211217

218+
@Test
219+
public void testParseAllLogPatternsWithNullInput() {
220+
List<String> messages =
221+
Arrays.asList(
222+
null,
223+
"PacketResponder failed for blk_6996194389878584395",
224+
"PacketResponder failed for blk_-1547954353065580372");
225+
Map<String, Map<String, Object>> logPatternMap = parser.parseAllLogPatterns(messages, 1);
226+
Map<String, Map<String, Object>> expectedResult =
227+
ImmutableMap.of(
228+
"",
229+
ImmutableMap.of("pattern_count", 1L, "pattern", "", "sample_logs", ImmutableList.of()),
230+
"PacketResponder failed for blk_<*>",
231+
ImmutableMap.of(
232+
"pattern_count",
233+
2L,
234+
"pattern",
235+
"PacketResponder failed for blk_<*>",
236+
"sample_logs",
237+
ImmutableList.of("PacketResponder failed for blk_6996194389878584395")));
238+
assertEquals(expectedResult, logPatternMap);
239+
}
240+
212241
@Test
213242
public void testParseAllLogPatterns() {
214243
Map<String, Map<String, Object>> logPatternMap = parser.parseAllLogPatterns(TEST_HDFS_LOGS, 2);
@@ -286,6 +315,19 @@ public void testParseLogPatternWhenHigherFrequencyTokenIsVariable() {
286315
assertTrue(parser.getGroupTokenSetMap().get("4-3,3-0").size() > 1);
287316
}
288317

318+
@Test
319+
public void testCollapseContinuousWildcards() {
320+
String correctTokenPattern =
321+
"BLOCK* NameSystem.allocateBlock: /user/root/_temporary/_task_<*>_r_<*>";
322+
String continuousTokenPattern =
323+
"BLOCK* NameSystem.allocateBlock: /user/root/_temporary/_task_<*><*>_r_<*><*><*>";
324+
325+
assertEquals(
326+
correctTokenPattern, BrainLogParser.collapseContinuousWildcards(continuousTokenPattern));
327+
assertEquals(
328+
correctTokenPattern, BrainLogParser.collapseContinuousWildcards(correctTokenPattern));
329+
}
330+
289331
private Map<String, Long> collectPatternByCountMap(
290332
Map<String, Map<String, Object>> logPatternMap) {
291333
return logPatternMap.entrySet().stream()

core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,7 @@ public static Patterns patterns(
577577
PatternMode patternMode,
578578
UnresolvedExpression patternMaxSampleCount,
579579
UnresolvedExpression patternBufferLimit,
580+
UnresolvedExpression showNumberedToken,
580581
java.util.Map<String, Literal> arguments) {
581582
return new Patterns(
582583
sourceField,
@@ -586,6 +587,7 @@ public static Patterns patterns(
586587
patternMode,
587588
patternMaxSampleCount,
588589
patternBufferLimit,
590+
showNumberedToken,
589591
arguments,
590592
input);
591593
}

core/src/main/java/org/opensearch/sql/ast/tree/Patterns.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ public class Patterns extends UnresolvedPlan {
3939
private final PatternMode patternMode;
4040
private final UnresolvedExpression patternMaxSampleCount;
4141
private final UnresolvedExpression patternBufferLimit;
42+
private final UnresolvedExpression showNumberedToken;
4243
private final Map<String, Literal> arguments;
4344
private UnresolvedPlan child;
4445

0 commit comments

Comments
 (0)