Add TRANSFORM_WITH_INDEX UDF (#15978)

abhinavmuk04 · meta-codesync[bot] · commit cfc3353a290b · 2026-02-25T14:38:06.000-08:00
Summary:
Pull Request resolved: #15978

Add a new TRANSFORM_WITH_INDEX function that allows transformation of array elements with access to their index, enabling more flexible feature engineering. The existing TRANSFORM function only passes the element to the lambda, making it difficult to use indices in array transform operations. This new function passes both the element and its 1-based index to the lambda:

```sql
TRANSFORM_WITH_INDEX(arr, (elem, index) -> ...)
```

## Function Signature

```
transform_with_index(array(T), function(T, bigint, U)) -> array(U)
```

## Examples

```sql
SELECT transform_with_index(ARRAY [5, 6, 7], (x, i) -> x * i); -- [5, 12, 21]
SELECT transform_with_index(ARRAY ['a', 'b', 'c'], (x, i) -> concat(x, cast(i as varchar))); -- ['a1', 'b2', 'c3']
SELECT transform_with_index(ARRAY [10, 20, 30], (x, i) -> i); -- [1, 2, 3]
```

## Implementation Details

- Uses 1-based indexing for Presto compatibility
- Follows the same pattern as the existing TRANSFORM function
- Added to fuzzer exclusion lists as this is a Velox-only function not available in Presto

Reviewed By: zacw7

Differential Revision: D90478316

fbshipit-source-id: 2689eeeb3ed3f8e6df8d4950735e9e31eef82e81
diff --git a/velox/docs/functions/presto/array.rst b/velox/docs/functions/presto/array.rst
@@ -458,6 +458,18 @@ Array Functions
         SELECT transform(ARRAY ['x', 'abc', 'z'], x -> x || '0'); -- ['x0', 'abc0', 'z0']
         SELECT transform(ARRAY [ARRAY [1, NULL, 2], ARRAY[3, NULL]], a -> filter(a, x -> x IS NOT NULL)); -- [[1, 2], [3]]
 
+.. function:: transform_with_index(array(T), function(T,bigint,U)) -> array(U)
+
+    Returns an array that is the result of applying ``function`` to each element of ``array``.
+    The lambda function receives both the element and its 1-based index as arguments.
+    This is useful for transformations that need to know the position of each element::
+
+        SELECT transform_with_index(ARRAY [], (x, i) -> x + i); -- []
+        SELECT transform_with_index(ARRAY [5, 6, 7], (x, i) -> x * i); -- [5, 12, 21]
+        SELECT transform_with_index(ARRAY ['a', 'b', 'c'], (x, i) -> concat(x, cast(i as varchar))); -- ['a1', 'b2', 'c3']
+        SELECT transform_with_index(ARRAY [10, 20, 30], (x, i) -> i); -- [1, 2, 3]
+        SELECT transform_with_index(ARRAY [1, 2, 3], (x, i) -> if(i % 2 = 1, x, x * 2)); -- [1, 4, 3]
+
 .. function:: trim_array(x, n) -> array
 
     Remove n elements from the end of ``array``::
diff --git a/velox/expression/fuzzer/ExpressionFuzzerTest.cpp b/velox/expression/fuzzer/ExpressionFuzzerTest.cpp
@@ -299,6 +299,7 @@ std::unordered_set<std::string> skipFunctionsSOT = {
                      // instances
     "array_subset", // Velox-only function, not available in Presto
     "map_values_in_range", // Velox-only function, not available in Presto
+    "transform_with_index", // Velox-only function, not available in Presto
     "remap_keys", // Velox-only function, not available in Presto
     "map_intersect", // Velox-only function, not available in Presto
     "map_keys_overlap", // Velox-only function, not available in Presto
diff --git a/velox/functions/prestosql/CMakeLists.txt b/velox/functions/prestosql/CMakeLists.txt
@@ -53,6 +53,7 @@ velox_add_library(
   Subscript.cpp
   ToUtf8.cpp
   Transform.cpp
+  TransformWithIndex.cpp
   TransformKeys.cpp
   TransformValues.cpp
   TypeOf.cpp
diff --git a/velox/functions/prestosql/TransformWithIndex.cpp b/velox/functions/prestosql/TransformWithIndex.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/expression/Expr.h"
+#include "velox/expression/VectorFunction.h"
+#include "velox/functions/lib/LambdaFunctionUtil.h"
+#include "velox/functions/lib/RowsTranslationUtil.h"
+#include "velox/vector/FunctionVector.h"
+
+namespace facebook::velox::functions {
+namespace {
+
+// Vector function: transform_with_index(array(T), function(T, bigint, U))
+// -> array(U).
+// Applies the lambda to every array element, passing the element and its
+// 1-based index; the result reuses the input's offsets and sizes.
+class TransformWithIndexFunction : public exec::VectorFunction {
+ public:
+  void apply(
+      const SelectivityVector& rows,
+      std::vector<VectorPtr>& args,
+      const TypePtr& outputType,
+      exec::EvalCtx& context,
+      VectorPtr& result) const override {
+    VELOX_CHECK_EQ(args.size(), 2); // args[0]: array, args[1]: lambda(s).
+
+    // Decode the input and obtain it as a flat ArrayVector (see flattenArray).
+    exec::LocalDecodedVector arrayDecoder(context, *args[0], rows);
+    auto& decodedArray = *arrayDecoder.get();
+
+    auto flatArray = flattenArray(rows, args[0], decodedArray);
+
+    auto newNumElements = flatArray->elements()->size();
+
+    // Second lambda argument: each element's 1-based position in its array.
+    auto indices = createIndicesVector(flatArray, rows, context.pool());
+
+    std::vector<VectorPtr> lambdaArgs = {flatArray->elements(), indices};
+
+    SelectivityVector validRowsInReusedResult =
+        toElementRows<ArrayVector>(newNumElements, rows, flatArray.get());
+
+    VectorPtr newElements; // Populated by the lambda calls below.
+
+    auto elementToTopLevelRows = getElementToTopLevelRows(
+        newNumElements, rows, flatArray.get(), context.pool());
+
+    // Apply each lambda in the function vector to the element rows it covers;
+    // in most cases there is a single lambda, so this loop runs exactly once.
+    auto it = args[1]->asUnchecked<FunctionVector>()->iterator(&rows);
+    while (auto entry = it.next()) {
+      auto elementRows = toElementRows<ArrayVector>(
+          newNumElements, *entry.rows, flatArray.get());
+      auto wrapCapture = toWrapCapture<ArrayVector>(
+          newNumElements, entry.callable, *entry.rows, flatArray);
+
+      entry.callable->apply(
+          elementRows,
+          &validRowsInReusedResult,
+          wrapCapture,
+          &context,
+          lambdaArgs,
+          elementToTopLevelRows,
+          &newElements);
+    }
+
+    // Build the result nulls, adding nulls for rows not selected in 'rows'.
+    BufferPtr newNulls = addNullsForUnselectedRows(flatArray, rows);
+
+    VectorPtr localResult = std::make_shared<ArrayVector>(
+        flatArray->pool(),
+        outputType,
+        std::move(newNulls),
+        rows.end(),
+        flatArray->offsets(),
+        flatArray->sizes(),
+        newElements);
+    context.moveOrCopyResult(localResult, rows, result);
+  }
+
+  static std::vector<std::shared_ptr<exec::FunctionSignature>> signatures() {
+    // Single signature: (array(T), function(T, bigint, U)) -> array(U).
+    return {exec::FunctionSignatureBuilder()
+                .typeVariable("T")
+                .typeVariable("U")
+                .returnType("array(U)")
+                .argumentType("array(T)")
+                .argumentType("function(T, bigint, U)")
+                .build()};
+  }
+
+ private:
+  // Builds a BIGINT vector, sized like flatArray->elements(), holding each
+  // element's 1-based position within its array: [[a, b, c], [d, e]] yields
+  // [1, 2, 3, 1, 2]. Slots outside selected, non-null rows are not written.
+  static VectorPtr createIndicesVector(
+      const std::shared_ptr<ArrayVector>& flatArray,
+      const SelectivityVector& rows,
+      memory::MemoryPool* pool) {
+    const auto numElements = flatArray->elements()->size();
+    auto indicesVector =
+        BaseVector::create<FlatVector<int64_t>>(BIGINT(), numElements, pool);
+    auto* rawIndices = indicesVector->mutableRawValues();
+
+    const auto* rawOffsets = flatArray->rawOffsets();
+    const auto* rawSizes = flatArray->rawSizes();
+    const auto* rawNulls = flatArray->rawNulls();
+
+    rows.applyToSelected([&](vector_size_t row) {
+      if (rawNulls && bits::isBitNull(rawNulls, row)) {
+        return; // Null array rows are skipped.
+      }
+      const auto offset = rawOffsets[row];
+      const auto size = rawSizes[row];
+      for (vector_size_t i = 0; i < size; ++i) {
+        // 1-based. NOTE(review): assumes row ranges never overlap; confirm.
+        rawIndices[offset + i] = i + 1;
+      }
+    });
+
+    return indicesVector;
+  }
+};
+} // namespace
+
+/// transform_with_index preserves nulls of the input array. Default null
+/// behavior is nevertheless disabled: an expression tree with a lambda
+/// depends on all named fields, including captures, and a null in a capture
+/// must not automatically produce a null result.
+
+VELOX_DECLARE_VECTOR_FUNCTION_WITH_METADATA(
+    udf_transform_with_index,
+    TransformWithIndexFunction::signatures(),
+    exec::VectorFunctionMetadataBuilder().defaultNullBehavior(false).build(),
+    std::make_unique<TransformWithIndexFunction>());
+
+} // namespace facebook::velox::functions
diff --git a/velox/functions/prestosql/registration/GeneralFunctionsRegistration.cpp b/velox/functions/prestosql/registration/GeneralFunctionsRegistration.cpp
@@ -91,6 +91,8 @@ void registerGeneralFunctions(const std::string& prefix) {
   registerElementAtFunction(prefix + "element_at", true);
 
   VELOX_REGISTER_VECTOR_FUNCTION(udf_transform, prefix + "transform");
+  VELOX_REGISTER_VECTOR_FUNCTION(
+      udf_transform_with_index, prefix + "transform_with_index");
   VELOX_REGISTER_VECTOR_FUNCTION(udf_reduce, prefix + "reduce");
   registerReduceRewrites(prefix);
   VELOX_REGISTER_VECTOR_FUNCTION(udf_array_filter, prefix + "filter");
diff --git a/velox/functions/prestosql/tests/CMakeLists.txt b/velox/functions/prestosql/tests/CMakeLists.txt
@@ -126,6 +126,7 @@ add_executable(
   TransformKeysTest.cpp
   TransformTest.cpp
   TransformValuesTest.cpp
+  TransformWithIndexTest.cpp
   TrimFunctionsTest.cpp
   TypeOfTest.cpp
   TDigestCastTest.cpp
diff --git a/velox/functions/prestosql/tests/TransformWithIndexTest.cpp b/velox/functions/prestosql/tests/TransformWithIndexTest.cpp