Commit 6541b28

actually verified test coverage

[ghstack-poisoned]

1 parent 3d59208 commit 6541b28
File tree

11 files changed: +136 -24 lines changed

kernels/portable/CMakeLists.txt

Lines changed: 8 additions & 1 deletion
@@ -69,8 +69,15 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
   target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
   target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
+  gen_selected_ops(LIB_NAME "optimized_portable_ops_lib" OPS_SCHEMA_YAML "${_yaml}")
+  generate_bindings_for_kernels(
+    LIB_NAME "optimized_portable_ops_lib" FUNCTIONS_YAML "${_yaml}"
+  )
+  gen_operators_lib(
+    LIB_NAME "optimized_portable_ops_lib" KERNEL_LIBS optimized_portable_kernels DEPS executorch_core
+  )
   install(
-    TARGETS optimized_portable_kernels
+    TARGETS optimized_portable_kernels optimized_portable_ops_lib
     DESTINATION lib
   )
 endif()

kernels/portable/cpu/op_add.cpp

Lines changed: 8 additions & 4 deletions
@@ -102,14 +102,18 @@ Tensor& add_scalar_out(
   static constexpr const char op_name[] = "add.Scalar_out";
 
   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
+    CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
+    auto val_alpha_times_b = val_alpha * val_b;
     utils::apply_unitensor_elementwise_fn<
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [b, alpha](const auto val_a) {
-          CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
-          CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
-          return val_a + val_alpha * val_b;
+        [val_alpha_times_b](const auto val_a) {
+          // Cast here supports vectorization; either it does nothing
+          // or it casts from CTYPE_COMPUTE to
+          // Vectorized<CTYPE_COMPUTE>.
+          return val_a + decltype(val_a)(val_alpha_times_b);
         },
         ctx,
         a,
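
The hoisted val_alpha_times_b and the decltype(val_a) cast are what make this lambda usable on both paths: the elementwise utility may invoke it with a plain CTYPE_COMPUTE or with at::vec::Vectorized<CTYPE_COMPUTE>, and the cast is either a no-op or a broadcast of the precomputed scalar. A minimal standalone sketch of the pattern, assuming ATen's vectorization headers are on the include path (as the ET_USE_PYTORCH_HEADERS build above arranges); fn and the literal values are illustrative only:

#include <ATen/cpu/vec/vec.h>
#include <iostream>

int main() {
  using Vec = at::vec::Vectorized<float>;
  const float val_alpha_times_b = 3.0f; // hoisted out of the lambda, as in the hunk above

  // Same shape as the kernel lambda: decltype(val_a) is float on the scalar
  // path and Vec on the vectorized path, so the cast either does nothing or
  // broadcasts the scalar into a vector before the SIMD operator+ is used.
  auto fn = [val_alpha_times_b](const auto val_a) {
    return val_a + decltype(val_a)(val_alpha_times_b);
  };

  float scalar_result = fn(1.0f);     // scalar path: 1 + 3
  Vec vector_result = fn(Vec(1.0f));  // vectorized path: {1, ...} + broadcast {3, ...}

  float lanes[Vec::size()];
  vector_result.store(lanes);
  std::cout << scalar_result << " " << lanes[0] << "\n"; // prints "4 4"
  return 0;
}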

kernels/portable/cpu/op_clamp.cpp

Lines changed: 2 additions & 3 deletions
@@ -138,9 +138,8 @@ Tensor& clamp_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) {
-          // TODO: rewrite this to be vectorization-capable.
-          CTYPE_COMPUTE val_out = val_in;
+        [has_min, min_opt, has_max, max_opt](const auto val_in) {
+          auto val_out = val_in;
           if (has_min) {
             val_out = utils::max_override(
                 val_out, utils::scalar_to<CTYPE_COMPUTE>(min_opt.value()));

kernels/portable/cpu/op_native_dropout.cpp

Lines changed: 6 additions & 4 deletions
@@ -57,8 +57,11 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
     }
     ET_SWITCH_FLOATHBF16_TYPES(
         input.scalar_type(), ctx, op_name, CTYPE_COMPUTE, [&]() {
-          utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
-              [](const auto val, const auto mask_val) {
+          utils::apply_bitensor_elementwise_fn<
+              CTYPE_COMPUTE,
+              op_name,
+              utils::SupportedTensorDtypes::SAME_AS_COMMON>(
+              [](const CTYPE_COMPUTE val, const CTYPE_COMPUTE mask_val) {
                 if (!mask_val) {
                   return static_cast<decltype(val)>(0);
                 }
@@ -70,8 +73,7 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
               mask,
              // TODO: should really be just BOOL
              utils::SupportedTensorDtypes::BOOL_OR_BYTE,
-              out,
-              utils::SupportedTensorDtypes::SAME_AS_COMMON);
+              out);
         });
   } else if (input.numel() > 0) {
     std::memcpy(out.mutable_data_ptr(), input.data_ptr(), input.nbytes());

kernels/portable/cpu/op_pow.cpp

Lines changed: 14 additions & 5 deletions
@@ -58,7 +58,6 @@ Tensor& pow_Tensor_Tensor_out(
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
         [](const auto val_a, const auto val_b) {
-          // TODO: rewrite this to be vectorization-capable.
           return executorch::math::pow(val_a, val_b);
         },
         ctx,
@@ -111,8 +110,13 @@ Tensor& pow_Tensor_Scalar_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        // TODO: rewrite this to be vectorization-capable.
-        [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); },
+        // Casting val_b here supports vectorization; it does
+        // nothing if we are not vectorizing (casts to
+        // CTYPE_COMPUTE) and casts to a vectorized type
+        // otherwise.
+        [val_b](const auto val_a) {
+          return executorch::math::pow(val_a, decltype(val_a)(val_b));
+        },
         ctx,
         a,
         utils::SupportedTensorDtypes::REALHBBF16,
@@ -161,8 +165,13 @@ Tensor& pow_Scalar_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        // TODO: rewrite this to be vectorization-capable.
-        [val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); },
+        // Casting val_a here supports vectorization; it does
+        // nothing if we are not vectorizing (casts to
+        // CTYPE_COMPUTE) and casts to a vectorized type
+        // otherwise.
+        [val_a](const auto val_b) {
+          return executorch::math::pow(decltype(val_b)(val_a), val_b);
+        },
         ctx,
         b,
         utils::SupportedTensorDtypes::REALHBBF16,

kernels/portable/cpu/util/elementwise_util.h

Lines changed: 19 additions & 0 deletions
@@ -131,11 +131,22 @@ inline void dtype_specialized_elementwise_fn_impl(
   const auto vectorized_end = end - (end % Vec::size());
   // Scalar prologue.
   for (const auto idx : c10::irange(begin, vectorized_begin)) {
+    // In debug mode, always use Vectorized so that even
+    // small-sized tests will test whether using Vectorized broke our
+    // lambda.
+#ifndef NDEBUG
+    std::array<Vec, kNumInputs> loaded_inputs;
+#else // NDEBUG
     std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs;
+#endif // NDEBUG
     for (const auto input_idx : c10::irange(kNumInputs)) {
       loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
     }
+#ifndef NDEBUG
+    std::apply(compute_fun, loaded_inputs).store(&data_out[idx], 1);
+#else // NDEBUG
     data_out[idx] = std::apply(compute_fun, loaded_inputs);
+#endif // NDEBUG
   }
 
   // Main vectorized loop.
@@ -152,11 +163,19 @@ inline void dtype_specialized_elementwise_fn_impl(
 
   // Scalar epilogue.
   for (const auto idx : c10::irange(vectorized_end, end)) {
+#ifndef NDEBUG
+    std::array<Vec, kNumInputs> loaded_inputs;
+#else // NDEBUG
     std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs;
+#endif // NDEBUG
     for (const auto input_idx : c10::irange(kNumInputs)) {
       loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
     }
+#ifndef NDEBUG
+    std::apply(compute_fun, loaded_inputs).store(&data_out[idx], 1);
+#else // NDEBUG
     data_out[idx] = std::apply(compute_fun, loaded_inputs);
+#endif // NDEBUG
   }
 });
 return;
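
The #ifndef NDEBUG branches make debug builds run even the single-element prologue and epilogue iterations through Vectorized: each scalar input is broadcast into a Vec, the lambda is invoked on the vector type, and store(&data_out[idx], 1) writes back only the one valid lane. Small test inputs, which may never reach the main vectorized loop, therefore still catch lambdas that compile for CTYPE_COMPUTE but not for Vectorized<CTYPE_COMPUTE>. A stripped-down sketch of the same idea, assuming ATen's vec headers are available; fn and the one-element buffers are illustrative only:

#include <ATen/cpu/vec/vec.h>
#include <iostream>

int main() {
  using Vec = at::vec::Vectorized<float>;
  const float in[1] = {2.0f};
  float out[1] = {0.0f};

  auto fn = [](const auto x) { return x + decltype(x)(1.0f); };

#ifndef NDEBUG
  // Debug: broadcast the single element into a Vec, run the lambda on the
  // vector type, and store exactly one lane back.
  Vec loaded = in[0];
  fn(loaded).store(out, 1);
#else
  // Release: plain scalar code, no Vectorized involved.
  out[0] = fn(in[0]);
#endif

  std::cout << out[0] << "\n"; // prints 3 on either path
  return 0;
}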

kernels/portable/cpu/util/math_util.h

Lines changed: 11 additions & 0 deletions
@@ -150,12 +150,23 @@ at::vec::Vectorized<T> min_override(
   return at::vec::minimum(a, b);
 }
 
+template <typename T>
+at::vec::Vectorized<T> min_override(at::vec::Vectorized<T> a, T b) {
+  return min_override(a, at::vec::Vectorized<T>(b));
+}
+
 template <typename T>
 at::vec::Vectorized<T> max_override(
     at::vec::Vectorized<T> a,
     at::vec::Vectorized<T> b) {
   return at::vec::maximum(a, b);
 }
+
+template <typename T>
+at::vec::Vectorized<T> max_override(at::vec::Vectorized<T> a, T b) {
+  return max_override(a, at::vec::Vectorized<T>(b));
+}
+
 #endif
 /**
  * There is a slight difference in how std::fmod works compared to how ATen
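
These mixed Vectorized/scalar overloads are what let the now-generic clamp lambda in op_clamp.cpp above compile on both paths: the elementwise utility instantiates the lambda once with CTYPE_COMPUTE and once with at::vec::Vectorized<CTYPE_COMPUTE>, and in the latter case max_override/min_override receive a Vectorized value together with the plain scalar produced by utils::scalar_to, which these overloads broadcast and forward. A rough standalone sketch of such an overload set and the two instantiations, assuming ATen's vec headers are available; clamp_fn, max_or and min_or are illustrative stand-ins, not the real utils helpers:

#include <ATen/cpu/vec/vec.h>
#include <algorithm>
#include <iostream>

using Vec = at::vec::Vectorized<float>;

// Stand-ins for utils::max_override / utils::min_override: one overload per
// argument combination the generic lambda can produce.
float max_or(float a, float b) { return std::max(a, b); }
Vec max_or(Vec a, float b) { return at::vec::maximum(a, Vec(b)); }
float min_or(float a, float b) { return std::min(a, b); }
Vec min_or(Vec a, float b) { return at::vec::minimum(a, Vec(b)); }

int main() {
  const float lo = 1.0f, hi = 6.0f;

  // Same shape as the clamp lambda: `auto` lets one body serve both paths.
  auto clamp_fn = [lo, hi](const auto val_in) {
    auto val_out = max_or(val_in, lo);
    return min_or(val_out, hi);
  };

  float s = clamp_fn(7.5f);     // scalar instantiation -> 6
  Vec v = clamp_fn(Vec(0.25f)); // vectorized instantiation -> all lanes 1

  float lanes[Vec::size()];
  v.store(lanes);
  std::cout << s << " " << lanes[0] << "\n"; // prints "6 1"
  return 0;
}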

kernels/test/CMakeLists.txt

Lines changed: 30 additions & 2 deletions
@@ -17,7 +17,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
-set(_kernels portable optimized quantized)
+set(_kernels portable optimized_portable optimized quantized)
 foreach(kernel ${_kernels})
   set(_wrapper_dir
       "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/test"
@@ -37,13 +37,17 @@ foreach(kernel ${_kernels})
     VERBATIM
   )
 
+  set(_supported_features_kernel ${kernel})
+  if(${kernel} STREQUAL "optimized_portable")
+    set(_supported_features_kernel "portable")
+  endif()
   add_custom_command(
     OUTPUT "${_wrapper_dir}/supported_features.cpp"
            "${_wrapper_dir}/supported_features.h"
     COMMAND mkdir -p ${_wrapper_dir}
     COMMAND
       ${PYTHON_EXECUTABLE} kernels/test/gen_supported_features.py
-      kernels/${kernel}/test/supported_features_def.yaml >
+      kernels/${_supported_features_kernel}/test/supported_features_def.yaml >
       ${_wrapper_dir}/supported_features.cpp
     COMMAND
       ${PYTHON_EXECUTABLE} kernels/test/gen_supported_features.py
@@ -57,6 +61,11 @@ foreach(kernel ${_kernels})
     set(_kernel_ops_lib "optimized_native_cpu_ops_lib")
     set(_kernel_ops_lib_path
         "${CMAKE_CURRENT_BINARY_DIR}/../../configurations/optimized_native_cpu_ops_lib"
+    )
+  elseif(${kernel} STREQUAL "optimized_portable")
+    set(_kernel_ops_lib "${kernel}_ops_lib")
+    set(_kernel_ops_lib_path
+        "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/portable/${kernel}_ops_lib"
     )
   else()
     set(_kernel_ops_lib "${kernel}_ops_lib")
@@ -88,6 +97,9 @@ add_custom_target(
   "${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/FunctionHeaderWrapper.h"
   "${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.h"
   "${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.cpp"
+  "${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable/executorch/kernels/test/FunctionHeaderWrapper.h"
+  "${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable/executorch/kernels/test/supported_features.h"
+  "${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable/executorch/kernels/test/supported_features.cpp"
   "${CMAKE_CURRENT_BINARY_DIR}/include/quantized/executorch/kernels/test/FunctionHeaderWrapper.h"
   "${CMAKE_CURRENT_BINARY_DIR}/include/quantized/executorch/kernels/test/supported_features.h"
   "${CMAKE_CURRENT_BINARY_DIR}/include/quantized/executorch/kernels/test/supported_features.cpp"
@@ -297,6 +309,22 @@ set(_optimized_kernels_test_sources
 if(TARGET optimized_portable_kernels)
   list(APPEND _optimized_kernels_test_sources ${all_test_sources})
   list(REMOVE_DUPLICATES _optimized_kernels_test_sources)
+
+  # Make sure that we still test optimized versions of portable
+  # kernels even if they would currently be shadowed by specific
+  # optimized implementations.
+  et_cxx_test(
+    optimized_portable_kernels_test
+    SOURCES
+    ${all_test_sources}
+    ${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable/executorch/kernels/test/supported_features.cpp
+    EXTRA_LIBS
+    optimized_portable_kernels
+  )
+  add_dependencies(optimized_portable_kernels_test generate_wrapper)
+  target_include_directories(
+    optimized_portable_kernels_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable"
+  )
 endif()
 
 et_cxx_test(

kernels/test/op_atan2_test.cpp

Lines changed: 3 additions & 4 deletions
@@ -47,17 +47,16 @@ TEST(OpAtan2OutTest, SmokeTest) {
   EXPECT_TENSOR_CLOSE(out, out_expected);
 }
 
-TEST(OpAtan2OutTest, SmokeTestNoBroadcasting) {
+TEST(OpAtan2OutTest, SmokeTestNoBroadcastingSameDtype) {
   TensorFactory<ScalarType::Double> tfDouble;
-  TensorFactory<ScalarType::Float> tfFloat;
 
   std::vector<double> a(18);
   std::iota(a.begin(), a.end(), -8);
   std::vector<double> b(18, 2.0);
   Tensor self = tfDouble.make({18}, a);
   Tensor other = tfDouble.make({18}, b);
-  Tensor out = tfFloat.zeros({18});
-  Tensor out_expected = tfFloat.make(
+  Tensor out = tfDouble.zeros({18});
+  Tensor out_expected = tfDouble.make(
       {18},
       {-1.3258176636680326,
        -1.2924966677897853,

kernels/test/op_clamp_test.cpp

Lines changed: 34 additions & 0 deletions
@@ -31,6 +31,15 @@ using torch::executor::testing::TensorFactory;
 
 using OptScalar = executorch::aten::optional<Scalar>;
 
+namespace {
+template <typename T>
+std::vector<T> arange(T stop) {
+  std::vector<T> result(stop);
+  std::iota(result.begin(), result.end(), 0);
+  return result;
+}
+} // namespace
+
 class OpClampOutTest : public OperatorTest {
  protected:
  Tensor& op_clamp_out(
@@ -114,6 +123,31 @@ class OpClampOutTest : public OperatorTest {
        // Should set all elements to max.
        {6, 6, 6, 6}, // expected_data
      },
+      {
+        std::string(__func__) + ": Simple clamp larger data",
+        {18}, // sizes
+        arange<typename ClampTestCase<DTYPE>::ctype>(18), // input_data
+        OptScalar(1), // min
+        OptScalar(6), // max
+        {1,
+         1,
+         2,
+         3,
+         4,
+         5,
+         6,
+         6,
+         6,
+         6,
+         6,
+         6,
+         6,
+         6,
+         6,
+         6,
+         6,
+         6}, // expected_data
+      },
   };

  run_test_cases(test_cases);
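
The arange helper plus the new 18-element case give the clamp test an input long enough that, presumably, the vectorized main loop in elementwise_util.h is exercised in addition to the scalar prologue and epilogue (the earlier cases have only four elements). Usage of the helper is straightforward:

#include <cassert>
#include <numeric>
#include <vector>

// Same helper the test adds: a vector {0, 1, ..., stop - 1}.
template <typename T>
std::vector<T> arange(T stop) {
  std::vector<T> result(stop);
  std::iota(result.begin(), result.end(), 0);
  return result;
}

int main() {
  const auto v = arange<int>(5);
  assert((v == std::vector<int>{0, 1, 2, 3, 4}));
  // The new test case feeds arange(18), in the test's ctype, to
  // clamp(min=1, max=6), yielding the expected {1, 1, 2, 3, 4, 5, 6, 6, ..., 6}.
  return 0;
}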
