Skip to content

Commit 104f7f0

Browse files
authored
[Backport 2.19-dev] Support Regex for replace eval function (#4456) (#4592)
1 parent 6b8f420 commit 104f7f0

File tree

4 files changed

+256
-3
lines changed

4 files changed

+256
-3
lines changed

core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,12 +246,15 @@
246246
import java.util.StringJoiner;
247247
import java.util.concurrent.ConcurrentHashMap;
248248
import java.util.function.BiFunction;
249+
import java.util.regex.Pattern;
250+
import java.util.regex.PatternSyntaxException;
249251
import java.util.stream.Collectors;
250252
import java.util.stream.Stream;
251253
import javax.annotation.Nullable;
252254
import org.apache.calcite.rel.type.RelDataType;
253255
import org.apache.calcite.rex.RexBuilder;
254256
import org.apache.calcite.rex.RexLambda;
257+
import org.apache.calcite.rex.RexLiteral;
255258
import org.apache.calcite.rex.RexNode;
256259
import org.apache.calcite.sql.SqlAggFunction;
257260
import org.apache.calcite.sql.SqlOperator;
@@ -695,7 +698,49 @@ void populate() {
695698
registerOperator(LOWER, SqlStdOperatorTable.LOWER);
696699
registerOperator(POSITION, SqlStdOperatorTable.POSITION);
697700
registerOperator(LOCATE, SqlStdOperatorTable.POSITION);
698-
registerOperator(REPLACE, SqlStdOperatorTable.REPLACE);
701+
// Register REPLACE with automatic PCRE-to-Java backreference conversion
702+
register(
703+
REPLACE,
704+
(RexBuilder builder, RexNode... args) -> {
705+
// Validate regex pattern at query planning time
706+
if (args.length >= 2 && args[1] instanceof RexLiteral) {
707+
RexLiteral patternLiteral = (RexLiteral) args[1];
708+
String pattern = patternLiteral.getValueAs(String.class);
709+
if (pattern != null) {
710+
try {
711+
// Compile pattern to validate it - this will throw PatternSyntaxException if
712+
// invalid
713+
Pattern.compile(pattern);
714+
} catch (PatternSyntaxException e) {
715+
// Convert to IllegalArgumentException so it's treated as a client error (400)
716+
throw new IllegalArgumentException(
717+
String.format("Invalid regex pattern '%s': %s", pattern, e.getDescription()),
718+
e);
719+
}
720+
}
721+
}
722+
723+
if (args.length == 3 && args[2] instanceof RexLiteral) {
724+
RexLiteral literal = (RexLiteral) args[2];
725+
String replacement = literal.getValueAs(String.class);
726+
if (replacement != null) {
727+
// Convert PCRE/sed backreferences (\1, \2) to Java style ($1, $2)
728+
String javaReplacement = replacement.replaceAll("\\\\(\\d+)", "\\$$1");
729+
if (!javaReplacement.equals(replacement)) {
730+
RexNode convertedLiteral =
731+
builder.makeLiteral(
732+
javaReplacement,
733+
literal.getType(),
734+
literal.getTypeName() != SqlTypeName.CHAR);
735+
return builder.makeCall(
736+
SqlLibraryOperators.REGEXP_REPLACE_3, args[0], args[1], convertedLiteral);
737+
}
738+
}
739+
}
740+
return builder.makeCall(SqlLibraryOperators.REGEXP_REPLACE_3, args);
741+
},
742+
wrapSqlOperandTypeChecker(
743+
SqlLibraryOperators.REGEXP_REPLACE_3.getOperandTypeChecker(), REPLACE.name(), false));
699744
registerOperator(UPPER, SqlStdOperatorTable.UPPER);
700745
registerOperator(ABS, SqlStdOperatorTable.ABS);
701746
registerOperator(ACOS, SqlStdOperatorTable.ACOS);

docs/user/ppl/functions/string.rst

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,9 +207,15 @@ REPLACE
207207
Description
208208
>>>>>>>>>>>
209209

210-
Usage: replace(str, substr, newstr) returns a string with all occurrences of substr replaced by newstr in str. If any argument is NULL, the function returns NULL.
210+
Usage: replace(str, pattern, replacement) returns a string with all occurrences of the pattern replaced by the replacement string in str. If any argument is NULL, the function returns NULL.
211211

212-
Example::
212+
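**Regular Expression Support**: The pattern argument is interpreted as a Java regular expression, so it may use character classes, quantifiers, anchors, and capture groups that can be referenced from the replacement string.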
213+
214+
Argument type: STRING, STRING (regex pattern), STRING (replacement)
215+
216+
Return type: STRING
217+
218+
Literal String Replacement Examples::
213219

214220
os> source=people | eval `REPLACE('helloworld', 'world', 'universe')` = REPLACE('helloworld', 'world', 'universe'), `REPLACE('helloworld', 'invalid', 'universe')` = REPLACE('helloworld', 'invalid', 'universe') | fields `REPLACE('helloworld', 'world', 'universe')`, `REPLACE('helloworld', 'invalid', 'universe')`
215221
fetched rows / total rows = 1/1
@@ -219,6 +225,41 @@ Example::
219225
| hellouniverse | helloworld |
220226
+--------------------------------------------+----------------------------------------------+
221227

228+
Regex Pattern Examples::
229+
230+
os> source=people | eval `Remove digits` = REPLACE('test123', '\d+', ''), `Collapse spaces` = REPLACE('hello world', ' +', ' '), `Remove special` = REPLACE('hello@world!', '[^a-zA-Z]', '') | fields `Remove digits`, `Collapse spaces`, `Remove special`
231+
fetched rows / total rows = 1/1
232+
+---------------+-----------------+----------------+
233+
| Remove digits | Collapse spaces | Remove special |
234+
|---------------+-----------------+----------------|
235+
| test | hello world | helloworld |
236+
+---------------+-----------------+----------------+
237+
238+
Capture Group and Backreference Examples::
239+
240+
os> source=people | eval `Swap date` = REPLACE('1/14/2023', '^(\d{1,2})/(\d{1,2})/', '$2/$1/'), `Reverse words` = REPLACE('Hello World', '(\w+) (\w+)', '$2 $1'), `Extract domain` = REPLACE('user@example.com', '.*@(.+)', '$1') | fields `Swap date`, `Reverse words`, `Extract domain`
241+
fetched rows / total rows = 1/1
242+
+-----------+---------------+----------------+
243+
| Swap date | Reverse words | Extract domain |
244+
|-----------+---------------+----------------|
245+
| 14/1/2023 | World Hello | example.com |
246+
+-----------+---------------+----------------+
247+
248+
Advanced Regex Examples::
249+
250+
os> source=people | eval `Clean phone` = REPLACE('(555) 123-4567', '[^0-9]', ''), `Remove vowels` = REPLACE('hello world', '[aeiou]', ''), `Add prefix` = REPLACE('test', '^', 'pre_') | fields `Clean phone`, `Remove vowels`, `Add prefix`
251+
fetched rows / total rows = 1/1
252+
+-------------+---------------+------------+
253+
| Clean phone | Remove vowels | Add prefix |
254+
|-------------+---------------+------------|
255+
| 5551234567 | hll wrld | pre_test |
256+
+-------------+---------------+------------+
257+
258+
**Note**: When using regex patterns in PPL queries:
259+
260+
* Backslashes must be escaped (use ``\\`` instead of ``\``) - e.g., ``\\d`` for digit pattern, ``\\w+`` for word characters
261+
* The replacement string accepts both PCRE-style (``\1``, ``\2``, etc.) and Java-style (``$1``, ``$2``, etc.) backreferences. PCRE-style backreferences in a literal replacement string are automatically converted to Java-style internally.
262+
222263

223264
REVERSE
224265
-------

integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
package org.opensearch.sql.calcite.remote;
77

8+
import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_ACCOUNT;
89
import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_STATE_COUNTRY;
910
import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_STATE_COUNTRY_WITH_NULL;
1011
import static org.opensearch.sql.util.MatcherUtils.*;
@@ -24,6 +25,7 @@ public void init() throws Exception {
2425

2526
loadIndex(Index.STATE_COUNTRY);
2627
loadIndex(Index.STATE_COUNTRY_WITH_NULL);
28+
loadIndex(Index.ACCOUNT);
2729
}
2830

2931
@Test
@@ -300,6 +302,77 @@ public void testReplace() throws IOException {
300302
verifyDataRows(actual, rows("Jane", 20, "heLLo"));
301303
}
302304

305+
@Test
306+
public void testReplaceWithRegexPattern() throws IOException {
307+
JSONObject actual =
308+
executeQuery(
309+
String.format(
310+
"source=%s | where account_number = 1 | eval street_only = replace(address,"
311+
+ " '\\\\d+ ', '') | fields address, street_only",
312+
TEST_INDEX_ACCOUNT));
313+
314+
verifySchema(actual, schema("address", "string"), schema("street_only", "string"));
315+
316+
verifyDataRows(actual, rows("880 Holmes Lane", "Holmes Lane"));
317+
}
318+
319+
@Test
320+
public void testReplaceWithCaptureGroups() throws IOException {
321+
JSONObject actual =
322+
executeQuery(
323+
String.format(
324+
"source=%s | where account_number = 1 | eval swapped = replace(firstname,"
325+
+ " '^(.)(.)', '\\\\2\\\\1') | fields firstname, swapped",
326+
TEST_INDEX_ACCOUNT));
327+
328+
verifySchema(actual, schema("firstname", "string"), schema("swapped", "string"));
329+
330+
verifyDataRows(actual, rows("Amber", "mAber"));
331+
}
332+
333+
@Test
334+
public void testReplaceWithEmailDomainReplacement() throws IOException {
335+
JSONObject actual =
336+
executeQuery(
337+
String.format(
338+
"source=%s | where account_number = 1 | eval new_email ="
339+
+ " replace(email, '([^@]+)@(.+)', '\\\\1@newdomain.com') | fields email,"
340+
+ " new_email",
341+
TEST_INDEX_ACCOUNT));
342+
343+
verifySchema(actual, schema("email", "string"), schema("new_email", "string"));
344+
345+
verifyDataRows(actual, rows("amberduke@pyrami.com", "amberduke@newdomain.com"));
346+
}
347+
348+
@Test
349+
public void testReplaceWithCharacterClasses() throws IOException {
350+
JSONObject actual =
351+
executeQuery(
352+
String.format(
353+
"source=%s | where account_number = 1 | eval masked = replace(address, '[a-zA-Z]',"
354+
+ " 'X') | fields address, masked",
355+
TEST_INDEX_ACCOUNT));
356+
357+
verifySchema(actual, schema("address", "string"), schema("masked", "string"));
358+
359+
verifyDataRows(actual, rows("880 Holmes Lane", "880 XXXXXX XXXX"));
360+
}
361+
362+
@Test
363+
public void testReplaceWithAnchors() throws IOException {
364+
JSONObject actual =
365+
executeQuery(
366+
String.format(
367+
"source=%s | where account_number = 1 | eval street_name = replace(address,"
368+
+ " '^\\\\d+\\\\s+', '') | fields address, street_name",
369+
TEST_INDEX_ACCOUNT));
370+
371+
verifySchema(actual, schema("address", "string"), schema("street_name", "string"));
372+
373+
verifyDataRows(actual, rows("880 Holmes Lane", "Holmes Lane"));
374+
}
375+
303376
@Test
304377
public void testLeft() throws IOException {
305378
JSONObject actual =
@@ -326,6 +399,51 @@ public void testStrCmp() throws IOException {
326399
verifyDataRows(actual, rows("Jane", 20));
327400
}
328401

402+
@Test
403+
public void testReplaceWithInvalidRegexPattern() {
404+
// Test invalid regex pattern - unclosed character class
405+
Throwable e1 =
406+
assertThrowsWithReplace(
407+
Exception.class,
408+
() ->
409+
executeQuery(
410+
String.format(
411+
"source=%s | eval result = replace(firstname, '[unclosed', 'X') | fields"
412+
+ " firstname, result",
413+
TEST_INDEX_ACCOUNT)));
414+
verifyErrorMessageContains(e1, "Invalid regex pattern");
415+
verifyErrorMessageContains(e1, "Unclosed character class");
416+
verifyErrorMessageContains(e1, "400 Bad Request");
417+
418+
// Test invalid regex pattern - unclosed group
419+
Throwable e2 =
420+
assertThrowsWithReplace(
421+
Exception.class,
422+
() ->
423+
executeQuery(
424+
String.format(
425+
"source=%s | eval result = replace(firstname, '(invalid', 'X') | fields"
426+
+ " firstname, result",
427+
TEST_INDEX_ACCOUNT)));
428+
verifyErrorMessageContains(e2, "Invalid regex pattern");
429+
verifyErrorMessageContains(e2, "Unclosed group");
430+
verifyErrorMessageContains(e2, "400 Bad Request");
431+
432+
// Test invalid regex pattern - dangling metacharacter
433+
Throwable e3 =
434+
assertThrowsWithReplace(
435+
Exception.class,
436+
() ->
437+
executeQuery(
438+
String.format(
439+
"source=%s | eval result = replace(firstname, '?invalid', 'X') | fields"
440+
+ " firstname, result",
441+
TEST_INDEX_ACCOUNT)));
442+
verifyErrorMessageContains(e3, "Invalid regex pattern");
443+
verifyErrorMessageContains(e3, "Dangling meta character");
444+
verifyErrorMessageContains(e3, "400 Bad Request");
445+
}
446+
329447
private void prepareTrim() throws IOException {
330448
Request request1 =
331449
new Request("PUT", "/opensearch-sql_test_index_state_country/_doc/5?refresh=true");

ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,4 +263,53 @@ public void testRegexMatchWithStats() {
263263
+ "WHERE REGEXP_CONTAINS(`JOB`, 'MAN')";
264264
verifyPPLToSparkSQL(root, expectedSparkSql);
265265
}
266+
267+
@Test
268+
public void testReplaceLiteralString() {
269+
// Test basic literal string replacement - replaces all 'A' with 'X'
270+
String ppl = "source=EMP | eval new_name = replace(ENAME, 'A', 'X') | fields ENAME, new_name";
271+
RelNode root = getRelNode(ppl);
272+
String expectedLogical =
273+
"LogicalProject(ENAME=[$1], new_name=[REGEXP_REPLACE($1, 'A', 'X')])\n"
274+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
275+
verifyLogical(root, expectedLogical);
276+
277+
String expectedSparkSql =
278+
"SELECT `ENAME`, REGEXP_REPLACE(`ENAME`, 'A', 'X') `new_name`\n" + "FROM `scott`.`EMP`";
279+
verifyPPLToSparkSQL(root, expectedSparkSql);
280+
}
281+
282+
@Test
283+
public void testReplaceWithRegexPattern() {
284+
// Test regex pattern - remove all digits
285+
String ppl = "source=EMP | eval no_digits = replace(JOB, '\\\\d+', '') | fields JOB, no_digits";
286+
RelNode root = getRelNode(ppl);
287+
String expectedLogical =
288+
"LogicalProject(JOB=[$2], no_digits=[REGEXP_REPLACE($2, '\\\\d+':VARCHAR, '':VARCHAR)])\n"
289+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
290+
verifyLogical(root, expectedLogical);
291+
292+
String expectedSparkSql =
293+
"SELECT `JOB`, REGEXP_REPLACE(`JOB`, '\\\\d+', '') `no_digits`\n" + "FROM `scott`.`EMP`";
294+
verifyPPLToSparkSQL(root, expectedSparkSql);
295+
}
296+
297+
@Test
298+
public void testReplaceWithRegexCaptureGroups() {
299+
// Test regex with capture groups - swap first two characters using \1 and \2 backreferences
300+
String ppl =
301+
"source=EMP | eval swapped = replace(ENAME, '^(.)(.)', '\\\\2\\\\1') | fields ENAME,"
302+
+ " swapped";
303+
RelNode root = getRelNode(ppl);
304+
String expectedLogical =
305+
"LogicalProject(ENAME=[$1], swapped=[REGEXP_REPLACE($1, '^(.)(.)':VARCHAR,"
306+
+ " '\\$2\\$1')])\n"
307+
+ " LogicalTableScan(table=[[scott, EMP]])\n";
308+
verifyLogical(root, expectedLogical);
309+
310+
String expectedSparkSql =
311+
"SELECT `ENAME`, REGEXP_REPLACE(`ENAME`, '^(.)(.)', '\\$2\\$1') `swapped`\n"
312+
+ "FROM `scott`.`EMP`";
313+
verifyPPLToSparkSQL(root, expectedSparkSql);
314+
}
266315
}

0 commit comments

Comments
 (0)