Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ public enum BuiltinFunctionName {
MVAPPEND(FunctionName.of("mvappend")),
MVJOIN(FunctionName.of("mvjoin")),
MVINDEX(FunctionName.of("mvindex")),
SPLIT(FunctionName.of("split")),
FORALL(FunctionName.of("forall")),
EXISTS(FunctionName.of("exists")),
FILTER(FunctionName.of("filter")),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.sql.expression.function.CollectionUDF;

import org.apache.calcite.rex.RexBuilder;
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.sql.SqlOperator;
import org.apache.calcite.sql.fun.SqlLibraryOperators;
import org.apache.calcite.sql.fun.SqlStdOperatorTable;
import org.opensearch.sql.expression.function.PPLFuncImpTable;

/**
 * Resolves the PPL {@code split(str, delimiter)} function into a Calcite expression that produces
 * an array of strings.
 *
 * <p>The resolved expression is a single {@code CASE}:
 *
 * <ul>
 *   <li>when {@code delimiter = ''}: a marker is inserted between adjacent characters with
 *       {@code REGEXP_REPLACE}, and the result is passed to {@code SPLIT} on that marker, so the
 *       string is broken into individual characters;
 *   <li>otherwise: {@code SPLIT(str, delimiter)} is used as is.
 * </ul>
 *
 * <p>Both branches are always present in the resolved expression; the choice between them is made
 * by the {@code CASE} condition rather than while the expression is being built.
 *
 * <p>Results for a delimiter that does not occur in the string, or for an empty input string, are
 * whatever Calcite's {@code SPLIT} returns; this class adds no handling of its own for them.
 *
 * <p>NOTE(review): the empty-delimiter branch uses {@code "|"} as its marker and then splits on
 * {@code "|"}, so an input that already contains {@code '|'} looks like it would yield extra
 * elements. The pattern also relies on {@code .}, which presumably does not match line
 * terminators under default regex flags. Confirm both before relying on character splitting for
 * arbitrary text.
 */
public class SplitFunctionImp implements PPLFuncImpTable.FunctionImp {

  /** Zero-width pattern matching each position that has a character on both sides. */
  private static final String BETWEEN_CHARACTERS = "(?<=.)(?=.)";

  /** Marker written between characters and then used as the SPLIT delimiter. */
  private static final String CHARACTER_MARKER = "|";

  /**
   * Builds the {@code CASE} expression described on the class.
   *
   * @param builder factory used to create every literal and call node
   * @param args the call arguments; {@code args[0]} is the string to split and {@code args[1]} is
   *     the delimiter
   * @return {@code CASE WHEN delimiter = '' THEN <character split> ELSE SPLIT(str, delimiter) END}
   */
  @Override
  public RexNode resolve(RexBuilder builder, RexNode... args) {
    final RexNode input = args[0];
    final RexNode separator = args[1];

    // Condition: separator = ''
    final RexNode separatorIsEmpty =
        builder.makeCall(SqlStdOperatorTable.EQUALS, separator, builder.makeLiteral(""));

    return builder.makeCall(
        SqlStdOperatorTable.CASE,
        separatorIsEmpty,
        splitIntoCharacters(builder, input),
        builder.makeCall(SqlLibraryOperators.SPLIT, input, separator));
  }

  /**
   * Builds the empty-delimiter branch: 'abcd' -> 'a|b|c|d' -> SPLIT on '|'.
   *
   * @param builder factory used to create the literal and call nodes
   * @param input the string expression to break into characters
   * @return {@code SPLIT(REGEXP_REPLACE(input, '(?<=.)(?=.)', '|'), '|')}
   */
  private static RexNode splitIntoCharacters(RexBuilder builder, RexNode input) {
    final SqlOperator regexpReplace = SqlLibraryOperators.REGEXP_REPLACE_3;
    final RexNode marked =
        builder.makeCall(
            regexpReplace,
            input,
            builder.makeLiteral(BETWEEN_CHARACTERS),
            builder.makeLiteral(CHARACTER_MARKER));
    return builder.makeCall(
        SqlLibraryOperators.SPLIT, marked, builder.makeLiteral(CHARACTER_MARKER));
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SINH;
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN;
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN_BUCKET;
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPLIT;
import static org.opensearch.sql.expression.function.BuiltinFunctionName.SQRT;
import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_POP;
import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_SAMP;
Expand Down Expand Up @@ -285,6 +286,7 @@
import org.opensearch.sql.exception.ExpressionEvaluationException;
import org.opensearch.sql.executor.QueryType;
import org.opensearch.sql.expression.function.CollectionUDF.MVIndexFunctionImp;
import org.opensearch.sql.expression.function.CollectionUDF.SplitFunctionImp;

public class PPLFuncImpTable {
private static final Logger logger = LogManager.getLogger(PPLFuncImpTable.class);
Expand Down Expand Up @@ -975,6 +977,12 @@ void populate() {
builder.makeCall(SqlLibraryOperators.ARRAY_JOIN, array, delimiter),
PPLTypeChecker.family(SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER));

// Register SPLIT with custom logic for empty delimiter
register(
SPLIT,
new SplitFunctionImp(),
PPLTypeChecker.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER));

// Register MVINDEX to use Calcite's ITEM/ARRAY_SLICE with index normalization
register(
MVINDEX,
Expand Down
54 changes: 54 additions & 0 deletions docs/user/ppl/functions/collection.rst
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,60 @@ Example::
| 120 |
+--------+

SPLIT
-----

Description
>>>>>>>>>>>

Usage: split(str, delimiter) splits the string on the delimiter and returns the resulting substrings as a multivalue field (array). Use an empty string ("") as the delimiter to split the original string into one value per character. If the delimiter is not found, returns an array containing the original string. If the input string is empty, returns an empty array.

Argument type: str: STRING, delimiter: STRING

Return type: ARRAY of STRING

Example::

os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1
fetched rows / total rows = 1/1
+------------------------------------+
| result |
|------------------------------------|
| [buttercup,rarity,tenderhoof,dash] |
+------------------------------------+

os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1
fetched rows / total rows = 1/1
+------------------+
| result |
|------------------|
| [1a2b3c4,567890] |
+------------------+

os> source=people | eval test = 'abcd', result = split(test, '') | fields result | head 1
fetched rows / total rows = 1/1
+-----------+
| result |
|-----------|
| [a,b,c,d] |
+-----------+

os> source=people | eval test = 'name::value', result = split(test, '::') | fields result | head 1
fetched rows / total rows = 1/1
+--------------+
| result |
|--------------|
| [name,value] |
+--------------+

os> source=people | eval test = 'hello', result = split(test, ',') | fields result | head 1
fetched rows / total rows = 1/1
+---------+
| result |
|---------|
| [hello] |
+---------+

MVJOIN
------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -489,4 +489,43 @@ public void testMvindexRangeSingleElement() throws IOException {
verifySchema(actual, schema("result", "array"));
verifyDataRows(actual, rows(List.of(3)));
}

// split() on a single-character delimiter returns one array element per segment.
@Test
public void testSplitWithSemicolonDelimiter() throws IOException {
  final String query =
      String.format(
          "source=%s | eval test = 'buttercup;rarity;tenderhoof;dash;mcintosh', result ="
              + " split(test, ';') | head 1 | fields result",
          TEST_INDEX_BANK);

  final JSONObject response = executeQuery(query);
  final List<String> expectedParts =
      List.of("buttercup", "rarity", "tenderhoof", "dash", "mcintosh");

  verifySchema(response, schema("result", "array"));
  verifyDataRows(response, rows(expectedParts));
}

// split() treats a multi-character delimiter as one separator, not as a set of characters.
@Test
public void testSplitWithMultiCharDelimiter() throws IOException {
  final String query =
      String.format(
          "source=%s | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |"
              + " fields result",
          TEST_INDEX_BANK);

  final JSONObject response = executeQuery(query);
  final List<String> expectedParts = List.of("1a2b3c4", "567890");

  verifySchema(response, schema("result", "array"));
  verifyDataRows(response, rows(expectedParts));
}

// split() with an empty delimiter breaks the string into individual characters.
@Test
public void testSplitWithEmptyDelimiter() throws IOException {
  final String query =
      String.format(
          "source=%s | eval test = 'abcd', result = split(test, '') | head 1 | fields result",
          TEST_INDEX_BANK);

  final JSONObject response = executeQuery(query);
  // One element per character of 'abcd'
  final List<String> expectedCharacters = List.of("a", "b", "c", "d");

  verifySchema(response, schema("result", "array"));
  verifyDataRows(response, rows(expectedCharacters));
}
}
1 change: 1 addition & 0 deletions ppl/src/main/antlr/OpenSearchPPLLexer.g4
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,7 @@ ARRAY_LENGTH: 'ARRAY_LENGTH';
MVAPPEND: 'MVAPPEND';
MVJOIN: 'MVJOIN';
MVINDEX: 'MVINDEX';
SPLIT: 'SPLIT';
FORALL: 'FORALL';
FILTER: 'FILTER';
TRANSFORM: 'TRANSFORM';
Expand Down
1 change: 1 addition & 0 deletions ppl/src/main/antlr/OpenSearchPPLParser.g4
Original file line number Diff line number Diff line change
Expand Up @@ -1095,6 +1095,7 @@ collectionFunctionName
| MVAPPEND
| MVJOIN
| MVINDEX
| SPLIT
| FORALL
| EXISTS
| FILTER
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,4 +214,81 @@ public void testMvindexRangeNegative() {
+ "LIMIT 1";
verifyPPLToSparkSQL(root, expectedSparkSql);
}

// Plans split() with a one-character literal delimiter; checks the logical plan and Spark SQL.
@Test
public void testSplitWithSemicolonDelimiter() {
  final RelNode plan =
      getRelNode(
          "source=EMP | eval test = 'buttercup;rarity;tenderhoof', result = split(test, ';') | head"
              + " 1 | fields result");

  // The CASE carries both the character-split branch and the plain SPLIT branch.
  final String logicalPlan =
      "LogicalProject(result=[$9])\n"
          + "  LogicalSort(fetch=[1])\n"
          + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
          + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['buttercup;rarity;tenderhoof':VARCHAR],"
          + " result=[CASE(=(';', ''),"
          + " SPLIT(REGEXP_REPLACE('buttercup;rarity;tenderhoof':VARCHAR, '(?<=.)(?=.)', '|'),"
          + " '|'), SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n"
          + "      LogicalTableScan(table=[[scott, EMP]])\n";
  verifyLogical(plan, logicalPlan);

  final String sparkSql =
      "SELECT CASE WHEN ';' = '' THEN SPLIT(REGEXP_REPLACE('buttercup;rarity;tenderhoof', "
          + "'(?<=.)(?=.)', '|'), '|') ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END "
          + "`result`\n"
          + "FROM `scott`.`EMP`\n"
          + "LIMIT 1";
  verifyPPLToSparkSQL(plan, sparkSql);
}

// Plans split() with a multi-character literal delimiter; checks the logical plan and Spark SQL.
@Test
public void testSplitWithMultiCharDelimiter() {
  final RelNode plan =
      getRelNode(
          "source=EMP | eval test = '1a2b3c4def567890', result = split(test, 'def') | head 1 |"
              + " fields result");

  // The CASE carries both the character-split branch and the plain SPLIT branch.
  final String logicalPlan =
      "LogicalProject(result=[$9])\n"
          + "  LogicalSort(fetch=[1])\n"
          + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
          + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['1a2b3c4def567890':VARCHAR],"
          + " result=[CASE(=('def':VARCHAR, ''), SPLIT(REGEXP_REPLACE('1a2b3c4def567890':VARCHAR,"
          + " '(?<=.)(?=.)', '|'), '|'), SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n"
          + "      LogicalTableScan(table=[[scott, EMP]])\n";
  verifyLogical(plan, logicalPlan);

  final String sparkSql =
      "SELECT CASE WHEN 'def' = '' THEN SPLIT(REGEXP_REPLACE('1a2b3c4def567890', "
          + "'(?<=.)(?=.)', '|'), '|') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n"
          + "FROM `scott`.`EMP`\n"
          + "LIMIT 1";
  verifyPPLToSparkSQL(plan, sparkSql);
}

// Plans split() with an empty literal delimiter; checks the logical plan and Spark SQL.
@Test
public void testSplitWithEmptyDelimiter() {
  final RelNode plan =
      getRelNode(
          "source=EMP | eval test = 'abcd', result = split(test, '') | head 1 | fields result");

  // With empty delimiter, should split into individual characters
  final String logicalPlan =
      "LogicalProject(result=[$9])\n"
          + "  LogicalSort(fetch=[1])\n"
          + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],"
          + " SAL=[$5], COMM=[$6], DEPTNO=[$7], test=['abcd':VARCHAR],"
          + " result=[CASE(=('':VARCHAR, ''), SPLIT(REGEXP_REPLACE('abcd':VARCHAR,"
          + " '(?<=.)(?=.)', '|'), '|'), SPLIT('abcd':VARCHAR, '':VARCHAR))])\n"
          + "      LogicalTableScan(table=[[scott, EMP]])\n";
  verifyLogical(plan, logicalPlan);

  final String sparkSql =
      "SELECT CASE WHEN '' = '' THEN SPLIT(REGEXP_REPLACE('abcd', '(?<=.)(?=.)', '|'), '|') "
          + "ELSE SPLIT('abcd', '') END `result`\n"
          + "FROM `scott`.`EMP`\n"
          + "LIMIT 1";
  verifyPPLToSparkSQL(plan, sparkSql);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,22 @@ public void testMvindex() {
anonymize("source=t | eval result=mvindex(array(1, 2, 3, 4, 5), 1, 3) | fields result"));
}

// Anonymizing split(): string literals become ***, field references become "identifier".
@Test
public void testSplit() {
  final String bothLiteralsMasked =
      "source=table | eval identifier=split(***,***) | fields + identifier";
  final String fieldAndLiteralMasked =
      "source=table | eval identifier=split(identifier,***) | fields + identifier";

  // Literal string with a literal delimiter
  assertEquals(
      bothLiteralsMasked,
      anonymize("source=t | eval result=split('a;b;c', ';') | fields result"));

  // Field reference with a literal delimiter
  assertEquals(
      fieldAndLiteralMasked,
      anonymize("source=t | eval result=split(text, ',') | fields result"));

  // Empty delimiter (character split) is masked the same way as any other literal
  assertEquals(
      bothLiteralsMasked,
      anonymize("source=t | eval result=split('abcd', '') | fields result"));
}

@Test
public void testRexWithOffsetField() {
when(settings.getSettingValue(Key.PPL_REX_MAX_MATCH_LIMIT)).thenReturn(10);
Expand Down
Loading