Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@
import org.elasticsearch.xpack.esql.expression.function.scalar.math.RoundTo;
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvMin;
import org.elasticsearch.xpack.esql.expression.function.scalar.nulls.Coalesce;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.RLike;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToLower;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToUpper;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.regex.RLike;
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Add;
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.Equals;
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.LessThan;
Expand Down
6 changes: 6 additions & 0 deletions docs/changelog/128393.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 128393
summary: Pushdown constructs doing case-insensitive regexes
area: ES|QL
type: enhancement
issues:
- 127479

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ static TransportVersion def(int id) {
public static final TransportVersion ML_INFERENCE_HUGGING_FACE_RERANK_ADDED = def(9_080_0_00);
public static final TransportVersion SETTINGS_IN_DATA_STREAMS_DRY_RUN = def(9_081_0_00);
public static final TransportVersion ML_INFERENCE_SAGEMAKER_CHAT_COMPLETION = def(9_082_0_00);
public static final TransportVersion ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY = def(9_083_0_00);
/*
* STOP! READ THIS FIRST! No, really,
* ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ public abstract class AbstractStringPattern implements StringPattern {

private Automaton automaton;

public abstract Automaton createAutomaton();
public abstract Automaton createAutomaton(boolean ignoreCase);

private Automaton automaton() {
if (automaton == null) {
automaton = createAutomaton();
automaton = createAutomaton(false);
}
return automaton;
}
Expand Down
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dropped the now useless proxy-class.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ public RLikePattern(String regexpPattern) {
}

@Override
public Automaton createAutomaton() {
public Automaton createAutomaton(boolean ignoreCase) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Expose ignoreCase as a property in StringPattern since it affects both the Automaton and javaRegex. The former can contain the mode but the latter doesn't so we need a way to bubble it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pattern is independent of how it's used for matching, casing-wise. The java regex version has its own mechanism to flag case insensitivity, and I'm not sure it'd be trivial, or "safe", or even needed to modify it based on a method parameter.
But even if we updated the StringPattern interface, we'd have to recreate the object if the RegexMatch requires case-insensitive matching, since the StringPattern object is created at parsing time (when it's not known if the matching will be case insensitive or not).
Furthermore, the matchesAll() and exactMatch() methods of AbstractStringPattern, which also call automaton(), are invariant to casing.
So not sure if we'd need any more changes, but if there's a better solution here, happy to apply it.

int matchFlags = ignoreCase ? RegExp.CASE_INSENSITIVE : 0;
return Operations.determinize(
new RegExp(regexpPattern, RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT).toAutomaton(),
new RegExp(regexpPattern, RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT, matchFlags).toAutomaton(),
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
);
}
Expand Down

This file was deleted.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as above - make the parameter a class property.

Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.elasticsearch.xpack.esql.core.util.StringUtils;

import java.util.Objects;

import static org.elasticsearch.xpack.esql.core.util.StringUtils.luceneWildcardToRegExp;

/**
* Similar to basic regex, supporting '?' wildcard for single character (same as regex ".")
* and '*' wildcard for multiple characters (same as regex ".*")
Expand All @@ -37,8 +40,14 @@ public String pattern() {
}

@Override
public Automaton createAutomaton() {
return WildcardQuery.toAutomaton(new Term(null, wildcard), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
public Automaton createAutomaton(boolean ignoreCase) {
return ignoreCase
? Operations.determinize(
new RegExp(luceneWildcardToRegExp(wildcard), RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT, RegExp.CASE_INSENSITIVE)
.toAutomaton(),
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
)
: WildcardQuery.toAutomaton(new Term(null, wildcard), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
package org.elasticsearch.xpack.esql.core.util;

import org.apache.lucene.document.InetAddressPoint;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spell.LevenshteinDistance;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
Expand Down Expand Up @@ -178,6 +179,44 @@ public static String wildcardToJavaPattern(String pattern, char escape) {
return regex.toString();
}

/**
* Translates a Lucene wildcard pattern to a Lucene RegExp one.
* @param wildcard Lucene wildcard pattern
* @return Lucene RegExp pattern
*/
public static String luceneWildcardToRegExp(String wildcard) {
StringBuilder regex = new StringBuilder();

for (int i = 0, wcLen = wildcard.length(); i < wcLen; i++) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

char c = wildcard.charAt(i); // this will work chunking through Unicode as long as all values matched are ASCII
switch (c) {
case WildcardQuery.WILDCARD_STRING -> regex.append(".*");
case WildcardQuery.WILDCARD_CHAR -> regex.append(".");
case WildcardQuery.WILDCARD_ESCAPE -> {
if (i + 1 < wcLen) {
// consume the wildcard escaping, consider the next char
char next = wildcard.charAt(i + 1);
i++;
switch (next) {
case WildcardQuery.WILDCARD_STRING, WildcardQuery.WILDCARD_CHAR, WildcardQuery.WILDCARD_ESCAPE ->
// escape `*`, `.`, `\`, since these are special chars in RegExp as well
regex.append("\\");
// default: unnecessary escaping -- just ignore the escaping
}
regex.append(next);
} else {
// "else fallthru, lenient parsing with a trailing \" -- according to WildcardQuery#toAutomaton
regex.append("\\\\");
}
}
case '$', '(', ')', '+', '.', '[', ']', '^', '{', '|', '}' -> regex.append("\\").append(c);
default -> regex.append(c);
}
}

return regex.toString();
}

/**
* Translates a like pattern to a Lucene wildcard.
* This method pays attention to the custom escape char which gets converted into \ (used by Lucene).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@

import org.elasticsearch.test.ESTestCase;

import static org.elasticsearch.xpack.esql.core.util.StringUtils.luceneWildcardToRegExp;
import static org.elasticsearch.xpack.esql.core.util.StringUtils.wildcardToJavaPattern;
import static org.hamcrest.Matchers.is;

public class StringUtilsTests extends ESTestCase {

Expand Down Expand Up @@ -55,4 +57,21 @@ public void testWildcard() {
public void testEscapedEscape() {
assertEquals("^\\\\\\\\$", wildcardToJavaPattern("\\\\\\\\", '\\'));
}

public void testLuceneWildcardToRegExp() {
    // each row is { Lucene wildcard input, expected Lucene RegExp output }
    String[][] conversions = {
        { "", "" },
        { "*", ".*" },
        { "?", "." },
        { "\\\\", "\\\\" },
        { "foo?bar", "foo.bar" },
        { "foo*bar", "foo.*bar" },
        { "foo\\\\bar", "foo\\\\bar" },
        { "foo*bar?baz", "foo.*bar.baz" },
        { "foo\\*bar", "foo\\*bar" },
        { "foo\\?bar\\?", "foo\\?bar\\?" },
        // a trailing escape char is kept as a literal backslash
        { "foo\\?bar\\", "foo\\?bar\\\\" },
        // characters that are plain in a wildcard, but must come out escaped
        { "[](){}^$.|+", "\\[\\]\\(\\)\\{\\}\\^\\$\\.\\|\\+" },
        // non-BMP code point (surrogate pair), with and without a preceding escape
        { "foo\\\uD83D\uDC14bar", "foo\uD83D\uDC14bar" },
        { "foo\uD83D\uDC14bar", "foo\uD83D\uDC14bar" } };

    for (String[] conversion : conversions) {
        String wildcard = conversion[0];
        String expectedRegExp = conversion[1];
        assertThat(luceneWildcardToRegExp(wildcard), is(expectedRegExp));
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.elasticsearch.xpack.esql.core.type.DataType;
import org.elasticsearch.xpack.esql.core.type.EsField;

import java.util.Locale;
import java.util.regex.Pattern;

import static java.util.Collections.emptyMap;
Expand Down Expand Up @@ -61,4 +62,15 @@ public static FieldAttribute getFieldAttribute(String name, DataType dataType) {
public static String stripThrough(String input) {
return WS_PATTERN.matcher(input).replaceAll(StringUtils.EMPTY);
}

/**
 * Returns a copy of the input with randomized letter casing. The input is cut into consecutive chunks of
 * {@code (int) Math.sqrt(input.length())} characters (the last chunk may be shorter) and each chunk is converted,
 * as a whole, either to lower case or to upper case, using {@link Locale#ROOT}.
 */
public static String randomCasing(String input) {
    final int length = input.length();
    final int chunkSize = (int) Math.sqrt(length);
    StringBuilder result = new StringBuilder(length);

    int start = 0;
    while (start < length) {
        int end = Math.min(start + chunkSize, length);
        String piece = input.substring(start, end);
        // one random draw per chunk decides the casing of the entire chunk
        if (randomBoolean()) {
            result.append(piece.toLowerCase(Locale.ROOT));
        } else {
            result.append(piece.toUpperCase(Locale.ROOT));
        }
        start += chunkSize;
    }
    return result.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@
import org.elasticsearch.xpack.esql.core.util.DateUtils;
import org.elasticsearch.xpack.esql.core.util.StringUtils;
import org.elasticsearch.xpack.esql.expression.function.EsqlFunctionRegistry;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.RLike;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.WildcardLike;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.regex.RLike;
import org.elasticsearch.xpack.esql.expression.function.scalar.string.regex.WildcardLike;
import org.elasticsearch.xpack.esql.expression.predicate.Range;
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.Equals;
import org.elasticsearch.xpack.esql.expression.predicate.operator.comparison.GreaterThan;
Expand Down
Loading
Loading