Skip to content

Commit ab02d56

Browse files
authored
[Feature] Implementation of mode sed and offset_field in rex PPL command (#4241)
* [Feature] Implementation of mode sed and offset_field in rex PPL command
  Signed-off-by: Jialiang Liang <[email protected]>
* update rex rst doc
  Signed-off-by: Jialiang Liang <[email protected]>
* chen - address comment and merge grammar in parser
  Signed-off-by: Jialiang Liang <[email protected]>
* chen - limit offset field only in extraction mode
  Signed-off-by: Jialiang Liang <[email protected]>
* chen - specify exception type of o_f UDF
  Signed-off-by: Jialiang Liang <[email protected]>
* chen - add exception type of o_f UDF - 2
  Signed-off-by: Jialiang Liang <[email protected]>
* chen - add exception type of o_f UDF - also fix the test
  Signed-off-by: Jialiang Liang <[email protected]>
* chen - alphabetical order of o_f return
  Signed-off-by: Jialiang Liang <[email protected]>
---------
Signed-off-by: Jialiang Liang <[email protected]>
1 parent fa3a12a commit ab02d56

File tree

15 files changed

+507
-12
lines changed

15 files changed

+507
-12
lines changed

core/src/main/java/org/opensearch/sql/ast/tree/Rex.java

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,38 +23,52 @@
2323
public class Rex extends UnresolvedPlan {
2424

2525
public enum RexMode {
26-
EXTRACT
26+
EXTRACT,
27+
SED
2728
}
2829

2930
/** Field to extract from. */
3031
private final UnresolvedExpression field;
3132

32-
/** Pattern with named capture groups. */
33+
/** Pattern with named capture groups or sed expression. */
3334
private final Literal pattern;
3435

35-
/** Rex mode (only EXTRACT supported). */
36+
/** Rex mode (extract or sed). */
3637
private final RexMode mode;
3738

3839
/** Maximum number of matches (optional). */
3940
private final Optional<Integer> maxMatch;
4041

42+
/** Offset field name for position tracking (optional). */
43+
private final Optional<String> offsetField;
44+
4145
/** Child Plan. */
4246
@Setter private UnresolvedPlan child;
4347

4448
public Rex(UnresolvedExpression field, Literal pattern) {
45-
this(field, pattern, RexMode.EXTRACT, Optional.empty());
49+
this(field, pattern, RexMode.EXTRACT, Optional.empty(), Optional.empty());
4650
}
4751

4852
public Rex(UnresolvedExpression field, Literal pattern, Optional<Integer> maxMatch) {
49-
this(field, pattern, RexMode.EXTRACT, maxMatch);
53+
this(field, pattern, RexMode.EXTRACT, maxMatch, Optional.empty());
5054
}
5155

5256
public Rex(
5357
UnresolvedExpression field, Literal pattern, RexMode mode, Optional<Integer> maxMatch) {
58+
this(field, pattern, mode, maxMatch, Optional.empty());
59+
}
60+
61+
public Rex(
62+
UnresolvedExpression field,
63+
Literal pattern,
64+
RexMode mode,
65+
Optional<Integer> maxMatch,
66+
Optional<String> offsetField) {
5467
this.field = field;
5568
this.pattern = pattern;
5669
this.mode = mode;
5770
this.maxMatch = maxMatch;
71+
this.offsetField = offsetField;
5872
}
5973

6074
@Override

core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,13 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
218218
RexNode fieldRex = rexVisitor.analyze(node.getField(), context);
219219
String patternStr = (String) node.getPattern().getValue();
220220

221+
if (node.getMode() == Rex.RexMode.SED) {
222+
RexNode sedCall = createOptimizedSedCall(fieldRex, patternStr, context);
223+
String fieldName = node.getField().toString();
224+
projectPlusOverriding(List.of(sedCall), List.of(fieldName), context);
225+
return context.relBuilder.peek();
226+
}
227+
221228
List<String> namedGroups = RegexCommonUtils.getNamedGroupCandidates(patternStr);
222229

223230
if (namedGroups.isEmpty()) {
@@ -252,6 +259,17 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
252259
newFieldNames.add(namedGroups.get(i));
253260
}
254261

262+
if (node.getOffsetField().isPresent()) {
263+
RexNode offsetCall =
264+
PPLFuncImpTable.INSTANCE.resolve(
265+
context.rexBuilder,
266+
BuiltinFunctionName.REX_OFFSET,
267+
fieldRex,
268+
context.rexBuilder.makeLiteral(patternStr));
269+
newFields.add(offsetCall);
270+
newFieldNames.add(node.getOffsetField().get());
271+
}
272+
255273
projectPlusOverriding(newFields, newFieldNames, context);
256274
return context.relBuilder.peek();
257275
}
@@ -2253,4 +2271,115 @@ private void buildExpandRelNode(
22532271
context.relBuilder.rename(names);
22542272
}
22552273
}
2274+
2275+
/** Creates an optimized sed call using native Calcite functions */
2276+
private RexNode createOptimizedSedCall(
2277+
RexNode fieldRex, String sedExpression, CalcitePlanContext context) {
2278+
if (sedExpression.startsWith("s/")) {
2279+
return createOptimizedSubstitution(fieldRex, sedExpression, context);
2280+
} else if (sedExpression.startsWith("y/")) {
2281+
return createOptimizedTransliteration(fieldRex, sedExpression, context);
2282+
} else {
2283+
throw new RuntimeException("Unsupported sed pattern: " + sedExpression);
2284+
}
2285+
}
2286+
2287+
/** Creates optimized substitution calls for s/pattern/replacement/flags syntax. */
2288+
private RexNode createOptimizedSubstitution(
2289+
RexNode fieldRex, String sedExpression, CalcitePlanContext context) {
2290+
try {
2291+
// Parse sed substitution: s/pattern/replacement/flags
2292+
if (!sedExpression.matches("s/.+/.*/.*")) {
2293+
throw new IllegalArgumentException("Invalid sed substitution format");
2294+
}
2295+
2296+
// Find the delimiters - sed format is s/pattern/replacement/flags
2297+
int firstDelimiter = sedExpression.indexOf('/', 2); // First '/' after 's/'
2298+
int secondDelimiter = sedExpression.indexOf('/', firstDelimiter + 1); // Second '/'
2299+
int thirdDelimiter = sedExpression.indexOf('/', secondDelimiter + 1); // Third '/' (optional)
2300+
2301+
if (firstDelimiter == -1 || secondDelimiter == -1) {
2302+
throw new IllegalArgumentException("Invalid sed substitution format");
2303+
}
2304+
2305+
String pattern = sedExpression.substring(2, firstDelimiter);
2306+
String replacement = sedExpression.substring(firstDelimiter + 1, secondDelimiter);
2307+
String flags =
2308+
secondDelimiter + 1 < sedExpression.length()
2309+
? sedExpression.substring(secondDelimiter + 1)
2310+
: "";
2311+
2312+
// Convert sed backreferences (\1, \2) to Java style ($1, $2)
2313+
String javaReplacement = replacement.replaceAll("\\\\(\\d+)", "\\$$1");
2314+
2315+
if (flags.isEmpty()) {
2316+
// 3-parameter REGEXP_REPLACE
2317+
return PPLFuncImpTable.INSTANCE.resolve(
2318+
context.rexBuilder,
2319+
BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_3,
2320+
fieldRex,
2321+
context.rexBuilder.makeLiteral(pattern),
2322+
context.rexBuilder.makeLiteral(javaReplacement));
2323+
} else if (flags.matches("[gi]+")) {
2324+
// 4-parameter REGEXP_REPLACE with flags
2325+
return PPLFuncImpTable.INSTANCE.resolve(
2326+
context.rexBuilder,
2327+
BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_PG_4,
2328+
fieldRex,
2329+
context.rexBuilder.makeLiteral(pattern),
2330+
context.rexBuilder.makeLiteral(javaReplacement),
2331+
context.rexBuilder.makeLiteral(flags));
2332+
} else if (flags.matches("\\d+")) {
2333+
// 5-parameter REGEXP_REPLACE with occurrence
2334+
int occurrence = Integer.parseInt(flags);
2335+
return PPLFuncImpTable.INSTANCE.resolve(
2336+
context.rexBuilder,
2337+
BuiltinFunctionName.INTERNAL_REGEXP_REPLACE_5,
2338+
fieldRex,
2339+
context.rexBuilder.makeLiteral(pattern),
2340+
context.rexBuilder.makeLiteral(javaReplacement),
2341+
context.relBuilder.literal(1), // start position
2342+
context.relBuilder.literal(occurrence));
2343+
} else {
2344+
throw new RuntimeException(
2345+
"Unsupported sed flags: " + flags + " in expression: " + sedExpression);
2346+
}
2347+
} catch (Exception e) {
2348+
throw new RuntimeException("Failed to optimize sed expression: " + sedExpression, e);
2349+
}
2350+
}
2351+
2352+
/** Creates optimized transliteration calls for y/from/to/ syntax. */
2353+
private RexNode createOptimizedTransliteration(
2354+
RexNode fieldRex, String sedExpression, CalcitePlanContext context) {
2355+
try {
2356+
// Parse sed transliteration: y/from/to/
2357+
if (!sedExpression.matches("y/.+/.*/.*")) {
2358+
throw new IllegalArgumentException("Invalid sed transliteration format");
2359+
}
2360+
2361+
int firstSlash = sedExpression.indexOf('/', 1);
2362+
int secondSlash = sedExpression.indexOf('/', firstSlash + 1);
2363+
int thirdSlash = sedExpression.indexOf('/', secondSlash + 1);
2364+
2365+
if (firstSlash == -1 || secondSlash == -1) {
2366+
throw new IllegalArgumentException("Invalid sed transliteration format");
2367+
}
2368+
2369+
String from = sedExpression.substring(firstSlash + 1, secondSlash);
2370+
String to =
2371+
sedExpression.substring(
2372+
secondSlash + 1, thirdSlash != -1 ? thirdSlash : sedExpression.length());
2373+
2374+
// Use Calcite's native TRANSLATE3 function
2375+
return PPLFuncImpTable.INSTANCE.resolve(
2376+
context.rexBuilder,
2377+
BuiltinFunctionName.INTERNAL_TRANSLATE3,
2378+
fieldRex,
2379+
context.rexBuilder.makeLiteral(from),
2380+
context.rexBuilder.makeLiteral(to));
2381+
} catch (Exception e) {
2382+
throw new RuntimeException("Failed to optimize sed expression: " + sedExpression, e);
2383+
}
2384+
}
22562385
}

core/src/main/java/org/opensearch/sql/expression/function/BuiltinFunctionName.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@ public enum BuiltinFunctionName {
225225
REGEX_MATCH(FunctionName.of("regex_match")),
226226
REX_EXTRACT(FunctionName.of("REX_EXTRACT")),
227227
REX_EXTRACT_MULTI(FunctionName.of("REX_EXTRACT_MULTI")),
228+
REX_OFFSET(FunctionName.of("REX_OFFSET")),
228229
REPLACE(FunctionName.of("replace")),
229230
REVERSE(FunctionName.of("reverse")),
230231
RIGHT(FunctionName.of("right")),

core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
import org.opensearch.sql.expression.function.udf.RelevanceQueryFunction;
6161
import org.opensearch.sql.expression.function.udf.RexExtractFunction;
6262
import org.opensearch.sql.expression.function.udf.RexExtractMultiFunction;
63+
import org.opensearch.sql.expression.function.udf.RexOffsetFunction;
6364
import org.opensearch.sql.expression.function.udf.SpanFunction;
6465
import org.opensearch.sql.expression.function.udf.condition.EarliestFunction;
6566
import org.opensearch.sql.expression.function.udf.condition.EnhancedCoalesceFunction;
@@ -408,6 +409,7 @@ public class PPLBuiltinOperators extends ReflectiveSqlOperatorTable {
408409
public static final SqlOperator REX_EXTRACT = new RexExtractFunction().toUDF("REX_EXTRACT");
409410
public static final SqlOperator REX_EXTRACT_MULTI =
410411
new RexExtractMultiFunction().toUDF("REX_EXTRACT_MULTI");
412+
public static final SqlOperator REX_OFFSET = new RexOffsetFunction().toUDF("REX_OFFSET");
411413

412414
// Aggregation functions
413415
public static final SqlAggFunction AVG_NULLABLE = new NullableSqlAvgAggFunction(SqlKind.AVG);

core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@
167167
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REVERSE;
168168
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_EXTRACT;
169169
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_EXTRACT_MULTI;
170+
import static org.opensearch.sql.expression.function.BuiltinFunctionName.REX_OFFSET;
170171
import static org.opensearch.sql.expression.function.BuiltinFunctionName.RIGHT;
171172
import static org.opensearch.sql.expression.function.BuiltinFunctionName.RINT;
172173
import static org.opensearch.sql.expression.function.BuiltinFunctionName.ROUND;
@@ -717,6 +718,7 @@ void populate() {
717718
registerOperator(MULTI_MATCH, PPLBuiltinOperators.MULTI_MATCH);
718719
registerOperator(REX_EXTRACT, PPLBuiltinOperators.REX_EXTRACT);
719720
registerOperator(REX_EXTRACT_MULTI, PPLBuiltinOperators.REX_EXTRACT_MULTI);
721+
registerOperator(REX_OFFSET, PPLBuiltinOperators.REX_OFFSET);
720722

721723
// Register PPL Datetime UDF operator
722724
registerOperator(TIMESTAMP, PPLBuiltinOperators.TIMESTAMP);
core/src/main/java/org/opensearch/sql/expression/function/udf/RexOffsetFunction.java

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.expression.function.udf;
7+
8+
import java.util.List;
9+
import java.util.regex.Matcher;
10+
import java.util.regex.Pattern;
11+
import java.util.regex.PatternSyntaxException;
12+
import org.apache.calcite.adapter.enumerable.NotNullImplementor;
13+
import org.apache.calcite.adapter.enumerable.NullPolicy;
14+
import org.apache.calcite.adapter.enumerable.RexToLixTranslator;
15+
import org.apache.calcite.linq4j.tree.Expression;
16+
import org.apache.calcite.linq4j.tree.Expressions;
17+
import org.apache.calcite.rex.RexCall;
18+
import org.apache.calcite.sql.type.ReturnTypes;
19+
import org.apache.calcite.sql.type.SqlReturnTypeInference;
20+
import org.opensearch.sql.calcite.utils.PPLOperandTypes;
21+
import org.opensearch.sql.expression.function.ImplementorUDF;
22+
import org.opensearch.sql.expression.function.UDFOperandMetadata;
23+
24+
/** Custom REX_OFFSET function for calculating regex match positions. */
25+
public final class RexOffsetFunction extends ImplementorUDF {
26+
27+
/**
 * Creates the REX_OFFSET UDF, backed by a {@link RexOffsetImplementor} and registered with
 * {@code NullPolicy.ARG0}.
 */
// NOTE(review): NullPolicy.ARG0 presumably means the call is null whenever the first operand
// (the field) is null - confirm against Calcite's NullPolicy documentation.
public RexOffsetFunction() {
28+
super(new RexOffsetImplementor(), NullPolicy.ARG0);
29+
}
30+
31+
/**
 * Declares the result type of REX_OFFSET as {@code ReturnTypes.VARCHAR_2000_NULLABLE}.
 *
 * @return the return type inference used for this function
 */
@Override
32+
public SqlReturnTypeInference getReturnTypeInference() {
33+
return ReturnTypes.VARCHAR_2000_NULLABLE;
34+
}
35+
36+
/**
 * Declares the accepted operands as {@code PPLOperandTypes.STRING_STRING}.
 *
 * @return the operand metadata used for this function
 */
// NOTE(review): presumably (field text, regex pattern), matching the two operands the
// implementor reads - confirm against PPLOperandTypes.
@Override
37+
public UDFOperandMetadata getOperandMetadata() {
38+
return PPLOperandTypes.STRING_STRING;
39+
}
40+
41+
private static class RexOffsetImplementor implements NotNullImplementor {
42+
43+
@Override
44+
public Expression implement(
45+
RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) {
46+
Expression field = translatedOperands.get(0);
47+
Expression pattern = translatedOperands.get(1);
48+
49+
return Expressions.call(RexOffsetFunction.class, "calculateOffsets", field, pattern);
50+
}
51+
}
52+
53+
public static String calculateOffsets(String text, String patternStr) {
54+
if (text == null || patternStr == null) {
55+
return null;
56+
}
57+
58+
try {
59+
Pattern pattern = Pattern.compile(patternStr);
60+
Matcher matcher = pattern.matcher(text);
61+
62+
if (!matcher.find()) {
63+
return null;
64+
}
65+
66+
List<String> offsetPairs = new java.util.ArrayList<>();
67+
68+
Pattern namedGroupPattern = Pattern.compile("\\(\\?<([^>]+)>");
69+
Matcher namedGroupMatcher = namedGroupPattern.matcher(patternStr);
70+
71+
int groupIndex = 1;
72+
73+
while (namedGroupMatcher.find()) {
74+
String groupName = namedGroupMatcher.group(1);
75+
76+
if (groupIndex <= matcher.groupCount()) {
77+
int start = matcher.start(groupIndex);
78+
int end = matcher.end(groupIndex);
79+
80+
if (start >= 0 && end >= 0) {
81+
offsetPairs.add(groupName + "=" + start + "-" + (end - 1));
82+
}
83+
}
84+
groupIndex++;
85+
}
86+
87+
java.util.Collections.sort(offsetPairs);
88+
return offsetPairs.isEmpty() ? null : String.join("&", offsetPairs);
89+
} catch (PatternSyntaxException e) {
90+
throw new IllegalArgumentException(
91+
"Invalid regex pattern in rex command: " + e.getMessage(), e);
92+
}
93+
}
94+
}

0 commit comments

Comments
 (0)