opensearch-project · ahkcs · Nov 14, 2025 · Nov 14, 2025 · Nov 14, 2025
@@ -75,6 +75,7 @@ public enum BuiltinFunctionName {
   MVAPPEND(FunctionName.of("mvappend")),
   MVJOIN(FunctionName.of("mvjoin")),
   MVINDEX(FunctionName.of("mvindex")),
+  SPLIT(FunctionName.of("split")),
   FORALL(FunctionName.of("forall")),
   EXISTS(FunctionName.of("exists")),
   FILTER(FunctionName.of("filter")),

@@ -0,0 +1,73 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.expression.function.CollectionUDF;
+
+import org.apache.calcite.rex.RexBuilder;
+import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.sql.SqlOperator;
+import org.apache.calcite.sql.fun.SqlLibraryOperators;
+import org.apache.calcite.sql.fun.SqlStdOperatorTable;
+import org.opensearch.sql.expression.function.PPLFuncImpTable;
+
+/**
+ * Calcite-backed implementation of the PPL {@code split(str, delimiter)} function.
+ *
+ * <p>Returns an array of strings cut from {@code str} at each occurrence of the delimiter.
+ *
+ * <p>Documented behavior:
+ *
+ * <ul>
+ *   <li>An empty delimiter ("") yields one element per character
+ *   <li>A delimiter that never occurs yields a single-element array holding the input
+ *   <li>An empty input string yields an empty array
+ * </ul>
+ *
+ * <p>How the expression is built:
+ *
+ * <ul>
+ *   <li>Non-empty delimiter: both arguments are passed to Calcite's SPLIT
+ *   <li>Empty delimiter: REGEXP_REPLACE inserts '|' between adjacent characters and the result
+ *       is SPLIT on '|', so a '|' already present in the input is treated as a separator too
+ *   <li>A single CASE on {@code delimiter = ''} selects between the two
+ * </ul>
+ */
+public class SplitFunctionImp implements PPLFuncImpTable.FunctionImp {
+
+  /** Zero-width pattern: a position with a {@code .}-matched character on each side. */
+  private static final String BETWEEN_CHARACTERS = "(?<=.)(?=.)";
+
+  /** Separator inserted between characters on the empty-delimiter path. */
+  private static final String CHARACTER_SEPARATOR = "|";
+
+  @Override
+  public RexNode resolve(RexBuilder builder, RexNode... args) {
+    final RexNode input = args[0];
+    final RexNode separator = args[1];
+
+    // WHEN delimiter = '' ...
+    final RexNode separatorIsEmpty =
+        builder.makeCall(SqlStdOperatorTable.EQUALS, separator, builder.makeLiteral(""));
+
+    // ... THEN split character by character by inserting '|' between characters first,
+    // e.g. 'abcd' -> 'a|b|c|d' -> [a, b, c, d] ...
+    final RexNode perCharacter = splitIntoCharacters(builder, input);
+
+    // ... ELSE hand both arguments to Calcite's SPLIT unchanged
+    final RexNode perDelimiter = builder.makeCall(SqlLibraryOperators.SPLIT, input, separator);
+
+    return builder.makeCall(
+        SqlStdOperatorTable.CASE, separatorIsEmpty, perCharacter, perDelimiter);
+  }
+
+  /** Builds {@code SPLIT(REGEXP_REPLACE(input, pattern, '|'), '|')} for the given input. */
+  private static RexNode splitIntoCharacters(RexBuilder builder, RexNode input) {
+    final SqlOperator regexpReplace = SqlLibraryOperators.REGEXP_REPLACE_3;
+    final RexNode pattern = builder.makeLiteral(BETWEEN_CHARACTERS);
+    final RexNode pipe = builder.makeLiteral(CHARACTER_SEPARATOR);
+    final RexNode separated = builder.makeCall(regexpReplace, input, pattern, pipe);
+    return builder.makeCall(SqlLibraryOperators.SPLIT, separated, pipe);
+  }
+}
@@ -193,6 +193,7 @@
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.SINH;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPAN_BUCKET;
+import static org.opensearch.sql.expression.function.BuiltinFunctionName.SPLIT;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.SQRT;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_POP;
 import static org.opensearch.sql.expression.function.BuiltinFunctionName.STDDEV_SAMP;
@@ -285,6 +286,7 @@
 import org.opensearch.sql.exception.ExpressionEvaluationException;
 import org.opensearch.sql.executor.QueryType;
 import org.opensearch.sql.expression.function.CollectionUDF.MVIndexFunctionImp;
+import org.opensearch.sql.expression.function.CollectionUDF.SplitFunctionImp;
 
 public class PPLFuncImpTable {
   private static final Logger logger = LogManager.getLogger(PPLFuncImpTable.class);
@@ -975,6 +977,12 @@ void populate() {
                   builder.makeCall(SqlLibraryOperators.ARRAY_JOIN, array, delimiter),
           PPLTypeChecker.family(SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER));
 
+      // Register SPLIT with custom logic for empty delimiter
+      register(
+          SPLIT,
+          new SplitFunctionImp(),
+          PPLTypeChecker.family(SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER));
+
       // Register MVINDEX to use Calcite's ITEM/ARRAY_SLICE with index normalization
       register(
           MVINDEX,

@@ -186,6 +186,60 @@ Example::
     | 120    |
     +--------+
 
+SPLIT
+-----
+
+Description
+>>>>>>>>>>>
+
+Usage: split(str, delimiter) splits the string on each occurrence of the delimiter and returns the resulting substrings as a multivalue field (array). Use an empty string ("") as the delimiter to split the string into one value per character. If the delimiter is not found, the result is an array containing only the original string. If the input string is empty, the result is an empty array.
+
+Argument type: str: STRING, delimiter: STRING
+
+Return type: ARRAY of STRING
+
+Example::
+
+    os> source=people | eval test = 'buttercup;rarity;tenderhoof;dash', result = split(test, ';') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +------------------------------------+
+    | result                             |
+    |------------------------------------|
+    | [buttercup,rarity,tenderhoof,dash] |
+    +------------------------------------+
+
+    os> source=people | eval test = '1a2b3c4def567890', result = split(test, 'def') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +------------------+
+    | result           |
+    |------------------|
+    | [1a2b3c4,567890] |
+    +------------------+
+
+    os> source=people | eval test = 'abcd', result = split(test, '') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +-----------+
+    | result    |
+    |-----------|
+    | [a,b,c,d] |
+    +-----------+
+
+    os> source=people | eval test = 'name::value', result = split(test, '::') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +--------------+
+    | result       |
+    |--------------|
+    | [name,value] |
+    +--------------+
+
+    os> source=people | eval test = 'hello', result = split(test, ',') | fields result | head 1
+    fetched rows / total rows = 1/1
+    +---------+
+    | result  |
+    |---------|
+    | [hello] |
+    +---------+
+
 MVJOIN
 ------
 

@@ -489,4 +489,43 @@ public void testMvindexRangeSingleElement() throws IOException {
     verifySchema(actual, schema("result", "array"));
     verifyDataRows(actual, rows(List.of(3)));
   }
+
+  @Test
+  public void testSplitWithSemicolonDelimiter() throws IOException {
+    // Single-character delimiter: every ';'-separated segment comes back, in order
+    String ppl =
+        String.format(
+            "source=%s | eval test = 'buttercup;rarity;tenderhoof;dash;mcintosh',"
+                + " result = split(test, ';') | head 1 | fields result",
+            TEST_INDEX_BANK);
+    JSONObject reply = executeQuery(ppl);
+    verifySchema(reply, schema("result", "array"));
+    verifyDataRows(reply, rows(List.of("buttercup", "rarity", "tenderhoof", "dash", "mcintosh")));
+  }
+
+  @Test
+  public void testSplitWithMultiCharDelimiter() throws IOException {
+    // Multi-character delimiter: 'def' is matched as a whole, not per character
+    String ppl =
+        String.format(
+            "source=%s | eval test = '1a2b3c4def567890', result = split(test, 'def')"
+                + " | head 1 | fields result",
+            TEST_INDEX_BANK);
+    JSONObject reply = executeQuery(ppl);
+    verifySchema(reply, schema("result", "array"));
+    verifyDataRows(reply, rows(List.of("1a2b3c4", "567890")));
+  }
+
+  @Test
+  public void testSplitWithEmptyDelimiter() throws IOException {
+    // Empty delimiter: the input is broken into its individual characters
+    String ppl =
+        String.format(
+            "source=%s | eval test = 'abcd', result = split(test, '')"
+                + " | head 1 | fields result",
+            TEST_INDEX_BANK);
+    JSONObject reply = executeQuery(ppl);
+    verifySchema(reply, schema("result", "array"));
+    verifyDataRows(reply, rows(List.of("a", "b", "c", "d")));
+  }
 }
@@ -442,6 +442,7 @@ ARRAY_LENGTH:                       'ARRAY_LENGTH';
 MVAPPEND:                           'MVAPPEND';
 MVJOIN:                             'MVJOIN';
 MVINDEX:                            'MVINDEX';
+SPLIT:                              'SPLIT';
 FORALL:                             'FORALL';
 FILTER:                             'FILTER';
 TRANSFORM:                          'TRANSFORM';

@@ -1095,6 +1095,7 @@ collectionFunctionName
     | MVAPPEND
     | MVJOIN
     | MVINDEX
+    | SPLIT
     | FORALL
     | EXISTS
     | FILTER

@@ -214,4 +214,81 @@ public void testMvindexRangeNegative() {
             + "LIMIT 1";
     verifyPPLToSparkSQL(root, expectedSparkSql);
   }
+
+  @Test
+  public void testSplitWithSemicolonDelimiter() {
+    // Non-empty literal delimiter: the expected plan still carries both CASE branches
+    final String query =
+        "source=EMP | eval test = 'buttercup;rarity;tenderhoof',"
+            + " result = split(test, ';') | head 1 | fields result";
+    final String logicalPlan =
+        "LogicalProject(result=[$9])\n"
+            + "  LogicalSort(fetch=[1])\n"
+            + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3],"
+            + " HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7],"
+            + " test=['buttercup;rarity;tenderhoof':VARCHAR], result=[CASE(=(';', ''),"
+            + " SPLIT(REGEXP_REPLACE('buttercup;rarity;tenderhoof':VARCHAR,"
+            + " '(?<=.)(?=.)', '|'), '|'), SPLIT('buttercup;rarity;tenderhoof':VARCHAR, ';'))])\n"
+            + "      LogicalTableScan(table=[[scott, EMP]])\n";
+    final String sparkSql =
+        "SELECT CASE WHEN ';' = ''"
+            + " THEN SPLIT(REGEXP_REPLACE('buttercup;rarity;tenderhoof', '(?<=.)(?=.)', '|'), '|')"
+            + " ELSE SPLIT('buttercup;rarity;tenderhoof', ';') END `result`\n"
+            + "FROM `scott`.`EMP`\n"
+            + "LIMIT 1";
+
+    final RelNode plan = getRelNode(query);
+    verifyLogical(plan, logicalPlan);
+    verifyPPLToSparkSQL(plan, sparkSql);
+  }
+
+  @Test
+  public void testSplitWithMultiCharDelimiter() {
+    // Multi-character literal delimiter: 'def' is carried into the expected plan as-is
+    final String query =
+        "source=EMP | eval test = '1a2b3c4def567890', result = split(test, 'def')"
+            + " | head 1 | fields result";
+    final String logicalPlan =
+        "LogicalProject(result=[$9])\n"
+            + "  LogicalSort(fetch=[1])\n"
+            + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3],"
+            + " HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7],"
+            + " test=['1a2b3c4def567890':VARCHAR], result=[CASE(=('def':VARCHAR, ''),"
+            + " SPLIT(REGEXP_REPLACE('1a2b3c4def567890':VARCHAR, '(?<=.)(?=.)', '|'), '|'),"
+            + " SPLIT('1a2b3c4def567890':VARCHAR, 'def':VARCHAR))])\n"
+            + "      LogicalTableScan(table=[[scott, EMP]])\n";
+    final String sparkSql =
+        "SELECT CASE WHEN 'def' = '' THEN SPLIT(REGEXP_REPLACE('1a2b3c4def567890', '(?<=.)(?=.)',"
+            + " '|'), '|') ELSE SPLIT('1a2b3c4def567890', 'def') END `result`\n"
+            + "FROM `scott`.`EMP`\n"
+            + "LIMIT 1";
+    final RelNode plan = getRelNode(query);
+    verifyLogical(plan, logicalPlan);
+    verifyPPLToSparkSQL(plan, sparkSql);
+  }
+
+  @Test
+  public void testSplitWithEmptyDelimiter() {
+    // With an empty delimiter the function should split into individual characters
+    final String query =
+        "source=EMP | eval test = 'abcd', result = split(test, '') | head 1 | fields result";
+    final String logicalPlan =
+        "LogicalProject(result=[$9])\n"
+            + "  LogicalSort(fetch=[1])\n"
+            + "    LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3],"
+            + " HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7],"
+            + " test=['abcd':VARCHAR], result=[CASE(=('':VARCHAR, ''),"
+            + " SPLIT(REGEXP_REPLACE('abcd':VARCHAR, '(?<=.)(?=.)', '|'), '|'),"
+            + " SPLIT('abcd':VARCHAR, '':VARCHAR))])\n"
+            + "      LogicalTableScan(table=[[scott, EMP]])\n";
+    final String sparkSql =
+        "SELECT CASE WHEN '' = '' THEN SPLIT(REGEXP_REPLACE('abcd', '(?<=.)(?=.)', '|'), '|')"
+            + " ELSE SPLIT('abcd', '') END `result`\n"
+            + "FROM `scott`.`EMP`\n"
+            + "LIMIT 1";
+
+    final RelNode plan = getRelNode(query);
+    verifyLogical(plan, logicalPlan);
+    verifyPPLToSparkSQL(plan, sparkSql);
+  }
 }
@@ -822,6 +822,22 @@ public void testMvindex() {
         anonymize("source=t | eval result=mvindex(array(1, 2, 3, 4, 5), 1, 3) | fields result"));
   }
 
+  @Test
+  public void testSplit() {
+    // Test split with delimiter
+    assertEquals(
+        "source=table | eval identifier=split(***,***) | fields + identifier",
+        anonymize("source=t | eval result=split('a;b;c', ';') | fields result"));
+    // Test split with field reference
+    assertEquals(
+        "source=table | eval identifier=split(identifier,***) | fields + identifier",
+        anonymize("source=t | eval result=split(text, ',') | fields result"));
+    // Test split with empty delimiter (splits into characters)
+    assertEquals(
+        "source=table | eval identifier=split(***,***) | fields + identifier",
+        anonymize("source=t | eval result=split('abcd', '') | fields result"));
+  }
+
   @Test
   public void testRexWithOffsetField() {
     when(settings.getSettingValue(Key.PPL_REX_MAX_MATCH_LIMIT)).thenReturn(10);