The handwritten optimized code is similar to what we should be getting from the optimized portable op, as follows.

Handwritten optimized code (sketch below):
- If the input type matches the output type, perform a vectorized loop.
- Otherwise, generate specific mixed-dtype kernels, which aren't vectorized.
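
To make the handwritten pattern concrete, here is a minimal, self-contained sketch under assumed names (illustrative only, not the actual kernel code): the same-dtype path is a tight loop the compiler can auto-vectorize, while each mixed-dtype combination gets its own scalar loop.

```cpp
#include <cstddef>
#include <cstdint>

// Fast path: input and output are both float, so the body is a plain
// multiply that the compiler can auto-vectorize.
void mul_scalar_float(const float* in, float scalar, float* out, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = in[i] * scalar;
  }
}

// Slow path: one specific kernel per (input, output) dtype pair, e.g.
// int32 input -> float output. Not vectorized; converts element by element.
void mul_scalar_int32_to_float(
    const int32_t* in, float scalar, float* out, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = static_cast<float>(in[i]) * scalar;
  }
}
```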
Optimized portable op (sketch below):
- If the input type matches the output type, perform a vectorized loop (`dtype_specialized_elementwise_fn_impl` in elementwise_util.h).
- Otherwise, generate one specific kernel per compute type. Those kernels use non-inlined function calls to do loads and stores, trading off performance for a significant size reduction (`apply_elementwise_fn_generic_impl` in elementwise_util.h).

Both cases in the portable op variant also use `parallel_for`.
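
For comparison, here is a rough sketch of what the generic portable path trades away. All names here are hypothetical; this is not the real `apply_elementwise_fn_generic_impl`, just the same idea: the compute loop is instantiated once per compute type, element access goes through non-inlined function pointers that convert to and from that compute type, and the outer loop is split into ranges the way a `parallel_for` would split them.

```cpp
#include <cstddef>
#include <cstdint>

// Type-erased load/store: one tiny conversion function per dtype, reached
// through a function pointer, so only one compute loop per compute type
// needs to be instantiated (smaller code, slower element access).
using LoadFn = float (*)(const void* ptr, std::size_t i);
using StoreFn = void (*)(void* ptr, std::size_t i, float value);

float load_float(const void* p, std::size_t i) {
  return static_cast<const float*>(p)[i];
}
float load_int32(const void* p, std::size_t i) {
  return static_cast<float>(static_cast<const int32_t*>(p)[i]);
}
void store_float(void* p, std::size_t i, float v) {
  static_cast<float*>(p)[i] = v;
}

// One kernel per compute type (float here). Loads and stores are indirect
// calls, so the loop body won't vectorize, but the same kernel serves every
// input/output dtype combination.
void mul_scalar_generic(const void* in, LoadFn load, void* out, StoreFn store,
                        float scalar, std::size_t begin, std::size_t end) {
  for (std::size_t i = begin; i < end; ++i) {
    store(out, i, load(in, i) * scalar);
  }
}

// parallel_for-style driver: split [0, n) into grains; a real implementation
// would hand each range to a worker thread instead of running them in order.
void mul_scalar_parallel(const void* in, LoadFn load, void* out, StoreFn store,
                         float scalar, std::size_t n, std::size_t grain) {
  for (std::size_t begin = 0; begin < n; begin += grain) {
    std::size_t end = begin + grain < n ? begin + grain : n;
    mul_scalar_generic(in, load, out, store, scalar, begin, end);
  }
}
```

Calling `mul_scalar_parallel(int32_data, load_int32, float_out, store_float, 2.0f, n, 1024)` reuses the single float compute loop for an int32 -> float multiply; that reuse is the size win, and the per-element indirect calls are the performance cost.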
I attempted to do a performance test, but I found that `torch.mul(some_tensor, 2.0)` is exported as a call to `mul.Tensor`, *not* `mul.Scalar`.
41e7ffa added the ability for our tests to pass if we do emit `mul.Scalar` for this, but the follow-up diff to actually make that happen seems not to have landed. So I think another reason to delete this is that (if I understand correctly) it isn't used; we therefore have no specific evidence that it needs to exist, i.e. that we can't just use the optimized portable op instead.