Commit a512f05

georgehong authored and facebook-github-bot committed
Refactor op_cat to use util pattern
Summary: Refactor cat op and helper functions to be accessible in a cat-specific target, exposing functionality to code that requires it.

Differential Revision: D79599708
1 parent b054e8d commit a512f05

File tree

5 files changed: +166 −73 lines

kernels/portable/cpu/op_cat.cpp

Lines changed: 2 additions & 73 deletions
@@ -8,6 +8,7 @@
 
 #include <cstring>
 
+#include <executorch/kernels/portable/cpu/util/cat_util.h>
 #include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
@@ -37,79 +38,7 @@ Tensor& cat_out(
       resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok,
       InvalidArgument,
       out);
-
-  // Special handling when all inputs are 1D-empty tensors for aten consistency
-  // In that case, just return an 1D-empty tensor without checking dim
-  bool all_1d_empty = true;
-  for (size_t i = 0; i < tensors.size(); ++i) {
-    if (tensors[i].numel() != 0 || tensors[i].dim() != 1) {
-      all_1d_empty = false;
-      break;
-    }
-  }
-  if (all_1d_empty) {
-    return out;
-  }
-
-  const size_t outer = getLeadingDims(out, dim);
-  const size_t dim_stride = getTrailingDims(out, dim);
-  const size_t ninputs = tensors.size();
-
-  const auto out_type = out.scalar_type();
-  const bool out_is_complex =
-      executorch::runtime::isComplexType(out.scalar_type());
-
-  if (out_is_complex) {
-    // TODO: The current support for complex dtype enforces that input and
-    // output tensors have the same dtype. Support mixed dtypes in the future.
-    for (size_t i = 0; i < ninputs; ++i) {
-      const auto in_type = tensors[i].scalar_type();
-      ET_KERNEL_CHECK(ctx, out_type == in_type, InvalidArgument, out);
-    }
-    ET_SWITCH_COMPLEXH_TYPES(out_type, ctx, "cat.out", CTYPE, [&] {
-      CTYPE* out_ptr = out.mutable_data_ptr<CTYPE>();
-      for (size_t i = 0; i < outer; ++i) {
-        for (size_t j = 0; j < ninputs; ++j) {
-          if (tensors[j].numel() == 0) {
-            return;
-          }
-          size_t inner = tensors[j].size(dim) * dim_stride;
-          const CTYPE* const in_ptr =
-              tensors[j].const_data_ptr<CTYPE>() + i * inner;
-          memcpy(out_ptr, in_ptr, inner * sizeof(CTYPE));
-          out_ptr += inner;
-        }
-      }
-    });
-  } else {
-    ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "cat.out", CTYPE_OUT, [&] {
-      CTYPE_OUT* out_ptr = out.mutable_data_ptr<CTYPE_OUT>();
-      for (size_t i = 0; i < outer; ++i) {
-        for (size_t j = 0; j < ninputs; ++j) {
-          const auto in_type = tensors[j].scalar_type();
-          ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "cat.out", CTYPE_IN, [&] {
-            if (tensors[j].numel() == 0) {
-              return;
-            }
-            size_t inner = tensors[j].size(dim) * dim_stride;
-            const CTYPE_IN* const in_ptr =
-                tensors[j].const_data_ptr<CTYPE_IN>() + i * inner;
-
-            if (sizeof(CTYPE_IN) == sizeof(CTYPE_OUT)) {
-              memcpy(out_ptr, in_ptr, inner * sizeof(CTYPE_IN));
-            } else {
-              for (size_t k = 0; k < inner; ++k) {
-                out_ptr[k] = static_cast<CTYPE_OUT>(in_ptr[k]);
-              }
-            }
-            out_ptr += inner;
-          });
-        }
-      }
-    });
-  }
-
-  return out;
+  return cat_out_impl(ctx, tensors, dim, out);
 }
 
 } // namespace native
kernels/portable/cpu/util/cat_util.cpp

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/portable/cpu/util/cat_util.h>
+#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
+
+namespace torch::executor::native {
+
+bool check_cat_args(
+    executorch::aten::ArrayRef<Tensor> tensors,
+    int64_t dim,
+    Tensor& out) {
+  return torch::executor::check_cat_args(
+      tensors, dim, out);
+}
+
+void get_cat_out_target_size(
+    executorch::aten::ArrayRef<Tensor> tensors,
+    int64_t dim,
+    executorch::aten::SizesType* out_sizes,
+    size_t* out_ndim) {
+  torch::executor::get_cat_out_target_size(
+      tensors, dim, out_sizes, out_ndim);
+}
+
+Tensor& cat_out_impl(
+    KernelRuntimeContext& ctx,
+    executorch::aten::ArrayRef<Tensor> tensors,
+    int64_t dim,
+    Tensor& out) {
+  // Special handling when all inputs are 1D-empty tensors for aten consistency
+  // In that case, just return an 1D-empty tensor without checking dim
+  bool all_1d_empty = true;
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    if (tensors[i].numel() != 0 || tensors[i].dim() != 1) {
+      all_1d_empty = false;
+      break;
+    }
+  }
+  if (all_1d_empty) {
+    return out;
+  }
+
+  const size_t outer = getLeadingDims(out, dim);
+  const size_t dim_stride = getTrailingDims(out, dim);
+  const size_t ninputs = tensors.size();
+
+  const auto out_type = out.scalar_type();
+  const bool out_is_complex =
+      executorch::runtime::isComplexType(out.scalar_type());
+
+  if (out_is_complex) {
+    // TODO: The current support for complex dtype enforces that input and
+    // output tensors have the same dtype. Support mixed dtypes in the future.
+    for (size_t i = 0; i < ninputs; ++i) {
+      const auto in_type = tensors[i].scalar_type();
+      ET_KERNEL_CHECK(ctx, out_type == in_type, InvalidArgument, out);
+    }
+    ET_SWITCH_COMPLEXH_TYPES(out_type, ctx, "cat.out", CTYPE, [&] {
+      CTYPE* out_ptr = out.mutable_data_ptr<CTYPE>();
+      for (size_t i = 0; i < outer; ++i) {
+        for (size_t j = 0; j < ninputs; ++j) {
+          if (tensors[j].numel() == 0) {
+            return;
+          }
+          size_t inner = tensors[j].size(dim) * dim_stride;
+          const CTYPE* const in_ptr =
+              tensors[j].const_data_ptr<CTYPE>() + i * inner;
+          memcpy(out_ptr, in_ptr, inner * sizeof(CTYPE));
+          out_ptr += inner;
+        }
+      }
+    });
+  } else {
+    ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "cat.out", CTYPE_OUT, [&] {
+      CTYPE_OUT* out_ptr = out.mutable_data_ptr<CTYPE_OUT>();
+      for (size_t i = 0; i < outer; ++i) {
+        for (size_t j = 0; j < ninputs; ++j) {
+          const auto in_type = tensors[j].scalar_type();
+          ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "cat.out", CTYPE_IN, [&] {
+            if (tensors[j].numel() == 0) {
+              return;
+            }
+            size_t inner = tensors[j].size(dim) * dim_stride;
+            const CTYPE_IN* const in_ptr =
+                tensors[j].const_data_ptr<CTYPE_IN>() + i * inner;
+
+            if (sizeof(CTYPE_IN) == sizeof(CTYPE_OUT)) {
+              memcpy(out_ptr, in_ptr, inner * sizeof(CTYPE_IN));
+            } else {
+              for (size_t k = 0; k < inner; ++k) {
+                out_ptr[k] = static_cast<CTYPE_OUT>(in_ptr[k]);
+              }
+            }
+            out_ptr += inner;
+          });
+        }
+      }
+    });
+  }
+  return out;
+}
+} // namespace torch::executor::native
kernels/portable/cpu/util/cat_util.h

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace torch::executor::native {
+
+bool check_cat_args(
+    executorch::aten::ArrayRef<Tensor> tensors,
+    int64_t dim,
+    Tensor& out);
+
+void get_cat_out_target_size(
+    executorch::aten::ArrayRef<Tensor> tensors,
+    int64_t dim,
+    executorch::aten::SizesType* out_sizes,
+    size_t* out_ndim);
+
+Tensor& cat_out_impl(
+    KernelRuntimeContext& ctx,
+    executorch::aten::ArrayRef<Tensor> tensors,
+    int64_t dim,
+    Tensor& out);
+
+inline Tensor& cat_out_impl(
+    executorch::aten::ArrayRef<Tensor> tensors,
+    int64_t dim,
+    Tensor& out) {
+  KernelRuntimeContext ctx;
+  return cat_out_impl(ctx, tensors, dim, out);
+}
+
+} // namespace torch::executor::native
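For orientation, here is a minimal sketch (not part of this commit) of how a client kernel could consume the newly exposed API after adding a dependency on //executorch/kernels/portable/cpu/util:cat_util. The kernel name my_cat_like_op is hypothetical; cat_out_impl, check_cat_args, and get_cat_out_target_size are the helpers declared in cat_util.h above.

#include <executorch/kernels/portable/cpu/util/cat_util.h>

namespace torch::executor::native {

// Hypothetical client kernel (name is illustrative only). It assumes `out`
// has already been validated and resized, e.g. with the check_cat_args /
// get_cat_out_target_size helpers from cat_util.h, and delegates the actual
// copy to the shared implementation, the same way cat_out does after this
// refactor.
Tensor& my_cat_like_op(
    KernelRuntimeContext& ctx,
    executorch::aten::ArrayRef<Tensor> tensors,
    int64_t dim,
    Tensor& out) {
  return cat_out_impl(ctx, tensors, dim, out);
}

} // namespace torch::executor::native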

kernels/portable/cpu/util/targets.bzl

Lines changed: 15 additions & 0 deletions
@@ -19,6 +19,7 @@ def define_common_targets():
             "//executorch/kernels/portable/cpu/util:kernel_ops_util",
             "//executorch/kernels/portable/cpu:vec_ops",
             "//executorch/kernels/portable/cpu/util:matmul_ops_util",
+            "//executorch/kernels/portable/cpu/util:cat_util",
             "//executorch/kernels/portable/cpu/util:copy_ops_util",
             "//executorch/kernels/portable/cpu/util:transpose_util",
             "//executorch/kernels/portable/cpu/util:index_util",
@@ -302,6 +303,20 @@ def define_common_targets():
         visibility = ["//executorch/kernels/portable/cpu/..."],
     )
 
+    runtime.cxx_library(
+        name = "cat_util",
+        srcs = ["cat_util.cpp"],
+        exported_headers = ["cat_util.h"],
+        deps = [
+            "//executorch/runtime/kernel:kernel_includes",
+            "//executorch/kernels/portable/cpu/util:copy_ops_util",
+        ],
+        visibility = [
+            "//executorch/kernels/portable/cpu/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
     runtime.cxx_library(
         name = "broadcast_indexes_range",
         exported_headers = ["broadcast_indexes_range.h"],

shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl

Lines changed: 1 addition & 0 deletions
@@ -378,6 +378,7 @@ ATEN_OPS = (
     op_target(
         name = "op_cat",
         deps = [
+            "//executorch/kernels/portable/cpu/util:cat_util",
             "//executorch/kernels/portable/cpu/util:copy_ops_util",
         ],
     ),
