
Commit 2107afa

Clean up optimized op_add & op_sub
Differential Revision: D81199582
Pull Request resolved: #13764
1 parent f852949 commit 2107afa

File tree

6 files changed: +156 -238 lines changed


kernels/optimized/cpu/op_add.cpp

Lines changed: 62 additions & 50 deletions
@@ -10,7 +10,8 @@
 #include <ATen/cpu/vec/vec.h>
 #include <executorch/kernels/optimized/cpu/binary_ops.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
-#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
+#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>

@@ -31,6 +32,26 @@ Tensor& opt_add_out(
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = b.scalar_type();
   ScalarType out_type = out.scalar_type();
+  ScalarType common_type = promoteTypes(a_type, b_type);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      (canCast(common_type, out_type) &&
+       check_alpha_type(utils::get_scalar_dtype(alpha), common_type)),
+      InvalidArgument,
+      out);
+
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_to_broadcast_target_size(a, b, out) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "add.out";
 
   if (b.numel() == 1) {
     if (executorch::runtime::isComplexType(a_type) ||
@@ -40,13 +61,8 @@ Tensor& opt_add_out(
       // output tensors have the same dtype. Support mixed dtypes in the future.
       ET_KERNEL_CHECK(
           ctx, a_type == b_type && a_type == out_type, InvalidArgument, out);
-      ET_KERNEL_CHECK(
-          ctx,
-          resize_to_broadcast_target_size(a, b, out) == Error::Ok,
-          InvalidArgument,
-          out);
 
-      ET_SWITCH_COMPLEXH_TYPES(out_type, ctx, "add.out", CTYPE, [&]() {
+      ET_SWITCH_COMPLEXH_TYPES(out_type, ctx, op_name, CTYPE, [&]() {
         CTYPE alpha_val = utils::scalar_to<CTYPE>(alpha);
         CTYPE b_val = *b.const_data_ptr<CTYPE>();

@@ -61,14 +77,8 @@ Tensor& opt_add_out(
   } else if (
       a_type == b_type && a_type == out_type && a_type != ScalarType::Half &&
       a_type != ScalarType::BFloat16) {
-    ET_KERNEL_CHECK(
-        ctx,
-        resize_to_broadcast_target_size(a, b, out) == Error::Ok,
-        InvalidArgument,
-        out);
-
-    ET_SWITCH_REALB_TYPES(a_type, ctx, "add.out", CTYPE, [&]() {
-      ET_SWITCH_REALB_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() {
+    ET_SWITCH_REALB_TYPES(a_type, ctx, op_name, CTYPE, [&]() {
+      ET_SWITCH_REALB_TYPES(b_type, ctx, op_name, CTYPE_B, [&]() {
         CTYPE alpha_val;
         ET_KERNEL_CHECK(
             ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );
@@ -91,7 +101,6 @@ Tensor& opt_add_out(
     return opt_add_out(ctx, b, a, alpha, out);
   }
 
-  static constexpr const char op_name[] = "add.out";
   return torch::executor::kernels::impl::opt_add_sub_out_impl<false, op_name>(
       ctx, a, b, alpha, out);
 }
@@ -102,26 +111,29 @@ Tensor& opt_add_scalar_out(
     const Scalar& b,
     const Scalar& alpha,
     Tensor& out) {
-  (void)ctx;
-
   ScalarType a_type = a.scalar_type();
-  ScalarType common_type =
-      utils::promote_type_with_scalar(a_type, b, /*half_to_float*/ false);
+  ScalarType common_type = utils::promote_type_with_scalar(a_type, b);
   ScalarType out_type = out.scalar_type();
 
-  ET_CHECK(common_type == out_type);
+  ET_KERNEL_CHECK(
+      ctx,
+      (common_type == a_type &&
+       check_alpha_type(utils::get_scalar_dtype(alpha), common_type)),
+      InvalidArgument,
+      out);
 
-  if (common_type == ScalarType::Half || common_type == ScalarType::BFloat16) {
-    common_type = ScalarType::Float;
-  }
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(
+      ctx, resize_tensor(out, a.sizes()) == Error::Ok, InvalidArgument, out);
 
-  // Resize for dynamic shape
-  auto error = resize_tensor(out, a.sizes());
-  ET_CHECK_MSG(error == Error::Ok, "Failed to resize output tensor.");
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "add.Scalar_out";
 
   if (a_type == common_type && a_type == out_type &&
       a_type != ScalarType::Half && a_type != ScalarType::BFloat16) {
-    ET_SWITCH_REALB_TYPES(a_type, ctx, "add.Scalar_out", CTYPE, [&]() {
+    ET_SWITCH_REALB_TYPES(a_type, ctx, op_name, CTYPE, [&]() {
       CTYPE b_casted = utils::scalar_to<CTYPE>(b);
       CTYPE alpha_val;
       ET_KERNEL_CHECK(
@@ -137,28 +149,28 @@ Tensor& opt_add_scalar_out(
           out.numel());
     });
   } else {
-    ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "add.Scalar_out", CTYPE_A, [&]() {
-      ET_SWITCH_REALB_TYPES(
-          common_type, ctx, "add.Scalar_out", CTYPE_IN, [&]() {
-            ET_SWITCH_REALHBBF16_TYPES(
-                out_type, ctx, "add.Scalar_out", CTYPE_OUT, [&]() {
-                  CTYPE_IN b_casted = utils::scalar_to<CTYPE_IN>(b);
-                  CTYPE_IN alpha_val;
-                  ET_KERNEL_CHECK(
-                      ctx,
-                      utils::extract_scalar(alpha, &alpha_val),
-                      InvalidArgument, );
-
-                  const size_t n = a.numel();
-                  const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
-                  CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-                  for (auto i = 0; i < n; ++i) {
-                    out_data[i] = static_cast<CTYPE_OUT>(
-                        static_cast<CTYPE_IN>(a_data[i]) +
-                        alpha_val * b_casted);
-                  }
-                });
-          });
+    ScalarType compute_type = utils::internal::get_compute_type(common_type);
+
+    ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+      CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
+      CTYPE_COMPUTE val_alpha;
+      ET_KERNEL_CHECK(
+          ctx, utils::extract_scalar(alpha, &val_alpha), InvalidArgument, );
+      auto val_alpha_times_b = val_alpha * val_b;
+      utils::apply_unitensor_elementwise_fn<
+          CTYPE_COMPUTE,
+          op_name,
+          utils::SupportedTensorDtypes::SAME_AS_COMMON>(
+          [val_alpha_times_b](const auto val_a) {
+            // Cast here supports vectorization; either it does nothing
+            // or it casts from CTYPE_COMPUTE to
+            // Vectorized<CTYPE_COMPUTE>.
+            return val_a + decltype(val_a)(val_alpha_times_b);
+          },
+          ctx,
+          a,
+          utils::SupportedTensorDtypes::REALHBBF16,
+          out);
     });
   }
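Two details in the rewritten opt_add_scalar_out are worth unpacking.

First, the hoisted ET_KERNEL_CHECK calls state the promotion contract once, up front, and fail gracefully with InvalidArgument instead of aborting mid-kernel the way the old ET_CHECK did. A sketch of the tensor-tensor form of that contract from opt_add_out above, assuming the promoteTypes/canCast helpers that kernel_includes.h brings into scope (the accepts() wrapper is illustrative, not part of the diff):

    bool accepts(ScalarType a, ScalarType b, ScalarType out_t) {
      // The promoted common type must be castable to the out dtype: e.g.
      // (Int, Float) promotes to Float, so a Float out tensor passes, while
      // an Int out tensor fails because float -> integral casts are
      // disallowed by canCast.
      return canCast(promoteTypes(a, b), out_t);
    }

Second, the comment inside the new lambda: apply_unitensor_elementwise_fn may invoke the same generic lambda with a plain scalar element or with a SIMD vector of elements, which is why the captured scalar goes through decltype(val_a). A minimal sketch of why one body serves both cases, assuming ATen's at::vec::Vectorized as already included by this file (add_scaled is a hypothetical name):

    template <typename T>
    T add_scaled(T val_a, float val_alpha_times_b) {
      // T(scalar) is a no-op conversion when T = float, and a broadcast
      // into every SIMD lane when T = at::vec::Vectorized<float>.
      return val_a + T(val_alpha_times_b);
    }

    // float r = add_scaled(1.0f, 2.0f);                            // scalar
    // auto v = add_scaled(at::vec::Vectorized<float>(1.0f), 2.0f); // SIMD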

kernels/optimized/cpu/op_add_sub_impl.h

Lines changed: 26 additions & 92 deletions
@@ -10,7 +10,7 @@
 #include <ATen/cpu/vec/vec.h>
 #include <executorch/kernels/optimized/cpu/binary_ops.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
-#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>

@@ -19,55 +19,6 @@ namespace executor {
 namespace kernels {
 namespace impl {
 
-namespace {
-template <
-    bool can_cast,
-    typename CTYPE_A,
-    typename CTYPE_B,
-    typename CTYPE_IN,
-    typename CTYPE_OUT>
-struct AddInner;
-
-template <
-    typename CTYPE_A,
-    typename CTYPE_B,
-    typename CTYPE_IN,
-    typename CTYPE_OUT>
-struct AddInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
-  static void
-  run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) {
-    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
-        [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
-          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-          CTYPE_IN value = a_casted + alpha_val * b_casted;
-
-          return static_cast<CTYPE_OUT>(value);
-        },
-        a,
-        b,
-        out);
-  }
-};
-
-template <typename CTYPE_IN>
-struct ReportCanCastBug {
-  static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) {
-    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
-  }
-};
-
-template <
-    typename CTYPE_A,
-    typename CTYPE_B,
-    typename CTYPE_IN,
-    typename CTYPE_OUT>
-struct AddInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
-    : public ReportCanCastBug<CTYPE_IN> {};
-
-} // namespace
-
 using Tensor = executorch::aten::Tensor;
 using ScalarType = executorch::aten::ScalarType;

@@ -78,8 +29,6 @@ Tensor& opt_add_sub_out_impl(
     const Tensor& b,
     const Scalar& alpha,
     Tensor& out) {
-  (void)ctx;
-
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = b.scalar_type();
   ScalarType out_type = out.scalar_type();
@@ -115,14 +64,6 @@ Tensor& opt_add_sub_out_impl(
   }
 
   if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
-    // Resize for dynamic shape
-    ET_KERNEL_CHECK_MSG(
-        ctx,
-        resize_to_broadcast_target_size(a, b, out) == Error::Ok,
-        InvalidArgument,
-        out,
-        "Failed to resize output tensor.");
-
     ET_SWITCH_REALB_TYPES(a_type, ctx, op_name, CTYPE, [&]() {
       CTYPE alpha_val;
       ET_KERNEL_CHECK(
@@ -202,39 +143,32 @@ Tensor& opt_add_sub_out_impl(
       }
     });
   } else {
-    ScalarType common_type =
-        promoteTypes(a_type, b_type, /*half_to_float*/ true);
-    ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
+    ScalarType common_type = promoteTypes(a_type, b_type);
+    ScalarType compute_type =
+        native::utils::internal::get_compute_type(common_type);
 
-    ET_KERNEL_CHECK(
-        ctx,
-        resize_to_broadcast_target_size(a, b, out) == Error::Ok,
-        InvalidArgument,
-        out);
-
-    ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, op_name, CTYPE_A, [&]() {
-      ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, op_name, CTYPE_B, [&]() {
-        using CTYPE_IN = typename torch::executor::
-            promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
-        ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-        ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, op_name, CTYPE_OUT, [&]() {
-          CTYPE_IN alpha_val;
-          ET_KERNEL_CHECK(
-              ctx,
-              torch::executor::native::utils::extract_scalar(alpha, &alpha_val),
-              InvalidArgument, );
-          if constexpr (is_sub) {
-            alpha_val = -alpha_val;
-          }
-
-          AddInner<
-              can_cast<CTYPE_IN, CTYPE_OUT>::value,
-              CTYPE_A,
-              CTYPE_B,
-              CTYPE_IN,
-              CTYPE_OUT>::run(a, b, alpha_val, out);
-        });
-      });
+    ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+      CTYPE_COMPUTE val_alpha;
+      ET_KERNEL_CHECK(
+          ctx,
+          native::utils::extract_scalar(alpha, &val_alpha),
+          InvalidArgument, );
+      if constexpr (is_sub) {
+        val_alpha = -val_alpha;
+      }
+      native::utils::apply_bitensor_elementwise_fn<
+          CTYPE_COMPUTE,
+          op_name,
+          native::utils::SupportedTensorDtypes::REALHBBF16>(
+          [val_alpha](const auto val_a, const auto val_b) {
+            return val_a + val_alpha * val_b;
+          },
+          ctx,
+          a,
+          native::utils::SupportedTensorDtypes::REALHBBF16,
+          b,
+          native::utils::SupportedTensorDtypes::REALHBBF16,
+          out);
     });
   }
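The deleted AddInner/ReportCanCastBug machinery existed so that the triple ET_SWITCH over (a_type, b_type, out_type) could instantiate an add for every dtype combination and route the combinations that cannot cast to a debug-check stub. The replacement dispatches once, on a single compute type (get_compute_type takes over the job of the removed /*half_to_float*/ promotion flag, widening reduced-precision floats for the arithmetic), and lets apply_bitensor_elementwise_fn convert element values at the tensor boundaries. A minimal, self-contained sketch of that pattern, not the actual ExecuTorch utility:

    #include <cstddef>

    // Dispatch once on CTYPE_COMPUTE; per-element loads and stores convert
    // between the tensor dtypes and the compute type.
    template <
        typename CTYPE_COMPUTE,
        typename CTYPE_A,
        typename CTYPE_B,
        typename CTYPE_OUT,
        typename Op>
    void bitensor_fn_sketch(
        Op op, const CTYPE_A* a, const CTYPE_B* b, CTYPE_OUT* out, size_t n) {
      for (size_t i = 0; i < n; ++i) {
        out[i] = static_cast<CTYPE_OUT>(op(
            static_cast<CTYPE_COMPUTE>(a[i]),
            static_cast<CTYPE_COMPUTE>(b[i])));
      }
    }

Collapsing three nested type switches into one over the compute type is presumably where most of this commit's net -82 lines, and the accompanying template-instantiation and binary-size savings, come from.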
