Skip to content
Merged
28 changes: 28 additions & 0 deletions libs/grok/src/main/java/org/elasticsearch/grok/Grok.java
Original file line number Diff line number Diff line change
Expand Up @@ -257,4 +257,32 @@ public Regex getCompiledExpression() {
return compiledExpression;
}

public static String combinePatterns(List<String> patterns) {
return combinePatterns(patterns, null);
}

public static String combinePatterns(List<String> patterns, String traceMatchKey) {
String combinedPattern;
if (patterns.size() > 1) {
combinedPattern = "";
for (int i = 0; i < patterns.size(); i++) {
String pattern = patterns.get(i);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A bit of an edge case, but what would happen if some pattern is invalid? For example:

  1. %{WORD:word}\
  2. a)
  3. %{WORD:word}

The final result would be: (?:%{WORD:word}\)|(?:a))|(?:%{WORD:word})

Which lead to unexpected patterns.

This is user-made, so I don't think this is very problematic, but this looks like a grok injection and the method combinePatterns() would actually be lying here.

So, solutions:

  • Can we verify and warn on wrong patterns? Is that something we do in some other case?
  • Can we sanitize or remove invalid patterns?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whatever the resolution, we'll need tests for this, both in Grok and in ESQL, depending on what we do

Copy link
Contributor Author

@flash1293 flash1293 Oct 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a fun one - tested with ingest pipelines and this indeed works here:

POST /_ingest/pipeline/_simulate
{
  "docs": [
    {
      "_source": {
        "foo": "Test)x"
      }
    }
  ],
  "pipeline": {
    "processors": [
      {
        "grok": {
          "field": "foo",
          "patterns": [
            "%{WORD:word}\\",
            "x)"
          ]
        }
      }
    ]
  }
}

should fail because %{WORD:word}\\ and x) are not valid patterns, but it does pass, no warnings or similar.

I don't feel strongly about this case, we could pass the individual patterns down further and then validate this in the Grok lib, but:

  • It seems like a lot of work for an edge case
  • It's existing behavior in ingest pipelines
  • We would need to compile each regex individually, which sounds like it would add a bunch of additional computation we are avoiding now

I would lean towards leaving the current behavior, wdyt?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the code was centralized, I guess the pipeline uses (and already used) the same logic right?

Being pragmatic, validating multiple patterns should error (probably?), which means that validating a single pattern should error too. So technically this is existing behavior, and fixing would be a breaking change (Or a bug fix).
So yeah, let's just make an issue explaining this case (For ESQL at least), and continue with this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the code was centralized, I guess the pipeline uses (and already used) the same logic right?

Exactly!

So technically this is existing behavior, and fixing would be a breaking change

I never get tired of quoting Hyrums Law 🙂

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filed #136750

String valueWrap;
if (traceMatchKey != null) {
valueWrap = "(?<" + traceMatchKey + "." + i + ">" + pattern + ")";
} else {
valueWrap = "(?:" + patterns.get(i) + ")";
}
if (combinedPattern.isEmpty()) {
combinedPattern = valueWrap;
} else {
combinedPattern = combinedPattern + "|" + valueWrap;
}
}
} else {
combinedPattern = patterns.getFirst();
}

return combinedPattern;
}
}
19 changes: 19 additions & 0 deletions libs/grok/src/test/java/org/elasticsearch/grok/GrokTests.java
Original file line number Diff line number Diff line change
Expand Up @@ -927,6 +927,25 @@ private void testLogCallBack(boolean ecsCompatibility) {
assertThat(message.get(), containsString("regular expression has redundant nested repeat operator"));
}

public void testCombinedPatterns() {
String matchKey = "_ingest._grok_match_index";
String combined;
combined = Grok.combinePatterns(List.of(""), null);
assertThat(combined, equalTo(""));
combined = Grok.combinePatterns(List.of(""), matchKey);
assertThat(combined, equalTo(""));
combined = Grok.combinePatterns(List.of("foo"), null);
assertThat(combined, equalTo("foo"));
combined = Grok.combinePatterns(List.of("foo"), matchKey);
assertThat(combined, equalTo("foo"));
combined = Grok.combinePatterns(List.of("foo", "bar"), null);
assertThat(combined, equalTo("(?:foo)|(?:bar)"));
combined = Grok.combinePatterns(List.of("foo", "bar"));
assertThat(combined, equalTo("(?:foo)|(?:bar)"));
combined = Grok.combinePatterns(List.of("foo", "bar"), matchKey);
assertThat(combined, equalTo("(?<_ingest._grok_match_index.0>foo)|(?<_ingest._grok_match_index.1>bar)"));
}

private void assertGrokedField(String fieldName) {
String line = "foo";
// test both with and without ECS compatibility
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,15 @@ public final class GrokProcessor extends AbstractProcessor {
MatcherWatchdog matcherWatchdog
) {
super(tag, description);
String combinedPattern = Grok.combinePatterns(matchPatterns, traceMatch ? PATTERN_MATCH_KEY : null);
this.matchField = matchField;
this.matchPatterns = matchPatterns;
this.grok = new Grok(patternBank, combinePatterns(matchPatterns, traceMatch), matcherWatchdog, logger::debug);
this.grok = new Grok(patternBank, combinedPattern, matcherWatchdog, logger::debug);
this.traceMatch = traceMatch;
this.ignoreMissing = ignoreMissing;
// Joni warnings are only emitted on an attempt to match, and the warning emitted for every call to match which is too verbose
// so here we emit a warning (if there is one) to the logfile at warn level on construction / processor creation.
new Grok(patternBank, combinePatterns(matchPatterns, traceMatch), matcherWatchdog, logger::warn).match("___nomatch___");
new Grok(patternBank, combinedPattern, matcherWatchdog, logger::warn).match("___nomatch___");
}

@Override
Expand Down Expand Up @@ -113,31 +114,6 @@ List<String> getMatchPatterns() {
return matchPatterns;
}

static String combinePatterns(List<String> patterns, boolean traceMatch) {
String combinedPattern;
if (patterns.size() > 1) {
combinedPattern = "";
for (int i = 0; i < patterns.size(); i++) {
String pattern = patterns.get(i);
String valueWrap;
if (traceMatch) {
valueWrap = "(?<" + PATTERN_MATCH_KEY + "." + i + ">" + pattern + ")";
} else {
valueWrap = "(?:" + patterns.get(i) + ")";
}
if (combinedPattern.equals("")) {
combinedPattern = valueWrap;
} else {
combinedPattern = combinedPattern + "|" + valueWrap;
}
}
} else {
combinedPattern = patterns.get(0);
}

return combinedPattern;
}

public static final class Factory implements Processor.Factory {

private final MatcherWatchdog matcherWatchdog;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -296,22 +296,6 @@ public void testTraceWithOnePattern() throws Exception {
assertThat(doc.getFieldValue("_ingest._grok_match_index", String.class), equalTo("0"));
}

public void testCombinedPatterns() {
String combined;
combined = GrokProcessor.combinePatterns(List.of(""), false);
assertThat(combined, equalTo(""));
combined = GrokProcessor.combinePatterns(List.of(""), true);
assertThat(combined, equalTo(""));
combined = GrokProcessor.combinePatterns(List.of("foo"), false);
assertThat(combined, equalTo("foo"));
combined = GrokProcessor.combinePatterns(List.of("foo"), true);
assertThat(combined, equalTo("foo"));
combined = GrokProcessor.combinePatterns(List.of("foo", "bar"), false);
assertThat(combined, equalTo("(?:foo)|(?:bar)"));
combined = GrokProcessor.combinePatterns(List.of("foo", "bar"), true);
assertThat(combined, equalTo("(?<_ingest._grok_match_index.0>foo)|(?<_ingest._grok_match_index.1>bar)"));
}

public void testCombineSamePatternNameAcrossPatterns() throws Exception {
String fieldName = RandomDocumentPicks.randomFieldName(random());
IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,13 @@ emp_no:integer | a:keyword | b:keyword
10005 | null | null | null | null
;

multiPatterns
row text = "XYZ extractme" | grok text "ABC %{WORD:foo}","XYZ %{WORD:bar}" | keep foo, bar;

foo:keyword | bar:keyword
null | extractme
;

overwriteInputName
required_capability: grok_dissect_masking
row text = "123 abc", int = 5 | sort int asc | grok text "%{NUMBER:text:int} %{WORD:description}" | keep text, int, description;
Expand Down
2 changes: 1 addition & 1 deletion x-pack/plugin/esql/src/main/antlr/EsqlBaseParser.g4
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ commandNamedParameters
;

grokCommand
: GROK primaryExpression string
: GROK primaryExpression string (COMMA string)*
;

mvExpandCommand
Expand Down

Large diffs are not rendered by default.

Loading