5 changes: 4 additions & 1 deletion kernels/optimized/CMakeLists.txt
@@ -21,7 +21,10 @@ if(NOT EXECUTORCH_ROOT)
   set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
 endif()
 
-set(_common_compile_options -Wno-deprecated-declarations)
+set(_common_compile_options
+  $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
+  $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
+)
 
 # Note for apple platform we can rely on Accelerate framework Will come back to
 # this
5 changes: 2 additions & 3 deletions kernels/optimized/cpu/op_bmm.cpp
@@ -150,15 +150,14 @@ Tensor& opt_bmm_out(
   ET_KERNEL_CHECK(
       ctx, check_bmm_out_args(self, mat2, out), InvalidArgument, out);
 
-  constexpr auto name = "bmm.out";
   auto self_type = self.scalar_type();
 
   if (executorch::runtime::isComplexType(self_type)) {
-    ET_SWITCH_COMPLEXH_TYPES(self_type, ctx, name, CTYPE, [&]() {
+    ET_SWITCH_COMPLEXH_TYPES(self_type, ctx, "bmm.out", CTYPE, [&]() {
       bmm_kernel<CTYPE>(self, mat2, out);
     });
   } else {
-    ET_SWITCH_REALHBF16_TYPES(self_type, ctx, name, CTYPE, [&]() {
+    ET_SWITCH_REALHBF16_TYPES(self_type, ctx, "bmm.out", CTYPE, [&]() {
       bmm_kernel<CTYPE>(self, mat2, out);
     });
   }
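Note on the ET_SWITCH_* change above (the same pattern repeats in op_masked_scatter.cpp and op_view_as_real_copy.cpp below): the constexpr name local is dropped and the string literal is passed at each call site. Since the rest of this PR is about building with MSVC, a plausible motivation is that these macros expand to a lambda, and some MSVC versions are picky about referring to an uncaptured constexpr local from inside a lambda; passing the literal sidesteps that entirely. A minimal sketch of the before/after shape, using a hypothetical stand-in rather than the real macros:

#include <cstdio>

// Hypothetical stand-in for an ET_SWITCH_*-style expansion: the real macros
// produce a lambda that uses the op name (e.g. for error messages).
void before_style() {
  constexpr auto name = "bmm.out";  // constexpr local shared by several uses
  auto body = [&] { std::printf("running %s\n", name); };  // lambda reads the local
  body();
}

void after_style() {
  // Each call site now passes the literal itself, so the lambda produced by
  // the macro never needs to reach a local in the enclosing scope.
  auto body = [] { std::printf("running %s\n", "bmm.out"); };
  body();
}

int main() {
  before_style();
  after_style();
  return 0;
}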
2 changes: 1 addition & 1 deletion kernels/portable/CMakeLists.txt
@@ -23,7 +23,7 @@ endif()
 
 set(_common_compile_options
   $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
-  $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations>
+  $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
 )
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
4 changes: 1 addition & 3 deletions kernels/portable/cpu/op_masked_scatter.cpp
@@ -41,13 +41,11 @@ Tensor& masked_scatter_out(
       InvalidArgument,
       out);
 
-  constexpr auto op_name = "masked_scatter.out";
-
   int64_t idx = 0;
   int64_t src_numel = src.numel();
   bool src_numel_check = true;
 
-  ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, op_name, CTYPE, [&]() {
+  ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "masked_scatter.out", CTYPE, [&]() {
     const CTYPE* const src_data = src.const_data_ptr<CTYPE>();
     apply_binary_elementwise_fn<CTYPE, bool, CTYPE>(
         [src_data, &idx, &src_numel, &src_numel_check](
7 changes: 5 additions & 2 deletions kernels/portable/cpu/op_topk.cpp
@@ -8,6 +8,7 @@
 
 #include <c10/util/irange.h>
 #include <cmath>
+#include <functional>
 #include <tuple>
 
 #include <executorch/kernels/portable/cpu/util/math_util.h>
@@ -118,10 +119,12 @@ void perform_topk(
   }
 
   // Perform topk on the queue
-  const auto elem_greater = [](const elem_t& x, const elem_t& y) -> bool {
+  const std::function<bool(const elem_t&, const elem_t&)> elem_greater =
+      [](const elem_t& x, const elem_t& y) -> bool {
     return float_less_than(y.first, x.first);
   };
-  const auto elem_less = [](const elem_t& x, const elem_t& y) -> bool {
+  const std::function<bool(const elem_t&, const elem_t&)> elem_less =
+      [](const elem_t& x, const elem_t& y) -> bool {
     return float_less_than(x.first, y.first);
   };
   const auto cmp = largest ? elem_greater : elem_less;
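Note on the comparator change above: each lambda has its own unique closure type, so the original `largest ? elem_greater : elem_less` relied on both operands converting to a common type (for captureless lambdas, a function pointer). Wrapping both comparators in std::function spells that common type out, which is presumably friendlier to the toolchains this PR targets; that motivation is an inference, the diff itself does not say. A self-contained sketch of the pattern:

#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

int main() {
  // Both comparators share one explicit type, so the ternary below has an
  // unambiguous result type.
  using cmp_t = std::function<bool(int, int)>;
  const cmp_t greater = [](int x, int y) { return y < x; };
  const cmp_t less = [](int x, int y) { return x < y; };

  const bool largest = true;
  const auto cmp = largest ? greater : less;  // well-formed: both sides are cmp_t

  std::vector<int> v{3, 1, 4, 1, 5};
  std::sort(v.begin(), v.end(), cmp);
  for (int x : v) {
    std::cout << x << ' ';  // prints: 5 4 3 1 1
  }
  std::cout << '\n';
  return 0;
}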
14 changes: 7 additions & 7 deletions kernels/portable/cpu/op_view_as_real_copy.cpp
@@ -64,13 +64,13 @@ Tensor& view_as_real_copy_out(
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out);
 
-  constexpr auto op_name = "view_as_real_copy.out";
-
-  ET_SWITCH_COMPLEXH_TYPES(self.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
-    ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
-      _to_impl<CTYPE_IN, CTYPE_OUT>(self, out);
-    });
-  });
+  ET_SWITCH_COMPLEX_TYPES(
+      self.scalar_type(), ctx, "view_as_real_copy.out", CTYPE_IN, [&] {
+        ET_SWITCH_FLOATH_TYPES(
+            out.scalar_type(), ctx, "view_as_real_copy.out", CTYPE_OUT, [&] {
+              _to_impl<CTYPE_IN, CTYPE_OUT>(self, out);
+            });
+      });
 
   return out;
 }

Review comment from the Contributor Author on the removed `constexpr auto op_name` line: "oh missed one."
5 changes: 4 additions & 1 deletion kernels/portable/cpu/util/CMakeLists.txt
@@ -21,7 +21,10 @@ endif()
 
 list(TRANSFORM _kernels_util_all_deps__srcs PREPEND "${EXECUTORCH_ROOT}/")
 
-set(_common_compile_options -Wno-deprecated-declarations)
+set(_common_compile_options
+  $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
+  $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
+)
 
 add_library(kernels_util_all_deps ${_kernels_util_all_deps__srcs})
 target_link_libraries(kernels_util_all_deps PRIVATE executorch_core)
48 changes: 23 additions & 25 deletions kernels/portable/cpu/util/elementwise_util.h
@@ -85,7 +85,6 @@ inline void dtype_specialized_elementwise_fn_impl(
   static_assert(
       (std::is_same_v<Args, std::pair<const Tensor*, SupportedTensorDtypes>> &&
        ...));
-  constexpr auto kNumInputs = sizeof...(inputs);
   // All inputs must be of type CTYPE_COMPUTE.
   ET_DCHECK(
       ((inputs.first->scalar_type() ==
@@ -105,8 +104,9 @@
         out.numel(),
         ::executorch::extension::internal::GRAIN_SIZE,
         [&](const auto begin, const auto end) {
-          std::array<const CTYPE_COMPUTE*, kNumInputs> inputs_data_ptrs = {
-              inputs.first->template const_data_ptr<CTYPE_COMPUTE>()...};
+          std::array<const CTYPE_COMPUTE*, sizeof...(inputs)>
+              inputs_data_ptrs = {
+                  inputs.first->template const_data_ptr<CTYPE_COMPUTE>()...};
 
           CTYPE_OUT* const data_out = out.mutable_data_ptr<CTYPE_OUT>();
 
@@ -119,11 +119,11 @@
           // small-sized tests will test whether using Vectorized broke our
           // lambda.
 #ifndef NDEBUG
-          std::array<Vec, kNumInputs> loaded_inputs{};
+          std::array<Vec, sizeof...(inputs)> loaded_inputs{};
 #else // NDEBUG
-          std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs{};
+          std::array<CTYPE_COMPUTE, sizeof...(inputs)> loaded_inputs{};
 #endif // NDEBUG
-          for (const auto input_idx : c10::irange(kNumInputs)) {
+          for (const auto input_idx : c10::irange(sizeof...(inputs))) {
             loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
           }
 #ifndef NDEBUG
@@ -136,8 +136,8 @@
           // Main vectorized loop.
           for (auto idx = vectorized_begin; idx < vectorized_end;
                idx += Vec::size()) {
-            std::array<Vec, kNumInputs> loaded_vec_inputs{};
-            for (const auto input_idx : c10::irange(kNumInputs)) {
+            std::array<Vec, sizeof...(inputs)> loaded_vec_inputs{};
+            for (const auto input_idx : c10::irange(sizeof...(inputs))) {
               loaded_vec_inputs[input_idx] =
                   Vec::loadu(&inputs_data_ptrs[input_idx][idx]);
             }
@@ -148,11 +148,11 @@
           // Scalar epilogue.
           for (const auto idx : c10::irange(vectorized_end, end)) {
 #ifndef NDEBUG
-            std::array<Vec, kNumInputs> loaded_inputs{};
+            std::array<Vec, sizeof...(inputs)> loaded_inputs{};
 #else // NDEBUG
-            std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs{};
+            std::array<CTYPE_COMPUTE, sizeof...(inputs)> loaded_inputs{};
 #endif // NDEBUG
-            for (const auto input_idx : c10::irange(kNumInputs)) {
+            for (const auto input_idx : c10::irange(sizeof...(inputs))) {
               loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
             }
 #ifndef NDEBUG
@@ -172,20 +172,20 @@
         out.numel(),
         ::executorch::extension::internal::GRAIN_SIZE,
         [&](const auto begin, const auto end) {
-          std::array<const CTYPE_COMPUTE*, kNumInputs> inputs_data_ptrs = {
+          std::array<const CTYPE_COMPUTE*, sizeof...(inputs)> inputs_data_ptrs = {
              inputs.first->template const_data_ptr<CTYPE_COMPUTE>()...};
 
           CTYPE_OUT* const data_out = out.mutable_data_ptr<CTYPE_OUT>();
 
-          const auto range =
-              BroadcastIndexesRange<kNumInputs, support_noncontiguous_tensors>(
-                  out, (*inputs.first)...);
+          const auto range = BroadcastIndexesRange<
+              sizeof...(inputs),
+              support_noncontiguous_tensors>(out, (*inputs.first)...);
           auto begin_it = range.begin();
           begin_it += begin;
           for (; (*begin_it)[0] < end; ++begin_it) {
             const auto& indexes = *begin_it;
-            std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs{};
-            for (const auto idx : c10::irange(kNumInputs)) {
+            std::array<CTYPE_COMPUTE, sizeof...(inputs)> loaded_inputs{};
+            for (const auto idx : c10::irange(sizeof...(inputs))) {
               loaded_inputs[idx] = inputs_data_ptrs[idx][indexes[idx + 1]];
             }
             data_out[indexes[0]] = std::apply(compute_fun, loaded_inputs);
@@ -229,14 +229,12 @@ inline void apply_elementwise_fn_generic_impl(
     const Tensor& out,
     SupportedTensorDtypes out_dtypes,
     Args... inputs) {
-  constexpr auto kNumInputs = sizeof...(inputs);
-
   struct InputInfo {
     load_to_compute_fn<CTYPE_COMPUTE> load_to_compute;
     const char* data_ptr;
     ssize_t element_size;
   };
-  std::array<InputInfo, kNumInputs> inputs_info = {(InputInfo{
+  std::array<InputInfo, sizeof...(inputs)> inputs_info = {(InputInfo{
       internal::get_load_to_compute_fn<CTYPE_COMPUTE, op_name>(
           ctx, *inputs.first, inputs.second),
       reinterpret_cast<const char*>(inputs.first->const_data_ptr()),
@@ -254,15 +252,15 @@
         out.numel(),
         ::executorch::extension::internal::GRAIN_SIZE,
         [&](const auto begin, const auto end) {
-          const auto range =
-              BroadcastIndexesRange<kNumInputs, support_noncontiguous_tensors>(
-                  out, (*inputs.first)...);
+          const auto range = BroadcastIndexesRange<
+              sizeof...(inputs),
+              support_noncontiguous_tensors>(out, (*inputs.first)...);
           auto begin_it = range.begin();
           begin_it += begin;
           for (; (*begin_it)[0] < end; ++begin_it) {
             const auto& indexes = *begin_it;
-            std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs{};
-            for (const auto idx : c10::irange(kNumInputs)) {
+            std::array<CTYPE_COMPUTE, sizeof...(inputs)> loaded_inputs{};
+            for (const auto idx : c10::irange(sizeof...(inputs))) {
               const auto& input_info = inputs_info[idx];
               loaded_inputs[idx] = input_info.load_to_compute(
                   &input_info
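Note on the elementwise_util.h changes above: every use of the kNumInputs constexpr local is replaced by sizeof...(inputs) evaluated where it is needed. A plausible motivation (the diff does not state one) is that these uses sit inside lambdas handed to parallel_for, and some MSVC versions reject using an uncaptured constexpr local from the enclosing function as a template argument inside a lambda, whereas sizeof... on the enclosing pack is always a constant expression. A hypothetical, self-contained sketch of the pattern, not the ExecuTorch code:

#include <array>
#include <cstddef>
#include <cstdio>

// The pack size is used as a std::array bound inside a lambda by writing
// sizeof...(args) directly instead of a constexpr local defined outside it.
template <typename... Args>
void print_sizes(Args... args) {
  auto body = [&]() {
    std::array<std::size_t, sizeof...(args)> sizes = {sizeof(Args)...};
    for (std::size_t s : sizes) {
      std::printf("%zu ", s);
    }
    std::printf("\n");
  };
  body();
}

int main() {
  print_sizes(1, 2.0, 'c');  // prints the byte sizes of int, double, char
  return 0;
}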
40 changes: 20 additions & 20 deletions runtime/core/exec_aten/util/tensor_util.h
@@ -66,7 +66,7 @@
  * dimension of all the tensors as the upper bound for the for loop.
  */
 #define ET_CHECK_SAME_SHAPE2(a__, b__) \
-  ({ \
+  do { \
     const size_t a_numel__ = (a__).numel(); \
     const size_t b_numel__ = (b__).numel(); \
     const size_t a_dim__ = (a__).dim(); \
@@ -89,10 +89,10 @@
         a_size__, \
         b_size__); \
     } \
-  })
+  } while (0)
 
 #define ET_CHECK_SAME_SHAPE3(a__, b__, c__) \
-  ({ \
+  do { \
     const size_t a_numel__ = (a__).numel(); \
     const size_t b_numel__ = (b__).numel(); \
     const size_t c_numel__ = (c__).numel(); \
@@ -124,22 +124,22 @@
         b_size__, \
         c_size__); \
     } \
-  })
+  } while (0)
 
 /// Asserts that all tensors have the same dtype.
 #define ET_CHECK_SAME_DTYPE2(a__, b__) \
-  ({ \
+  do { \
     const ::executorch::aten::ScalarType a_type__ = (a__).scalar_type(); \
     const ::executorch::aten::ScalarType b_type__ = (b__).scalar_type(); \
     ET_CHECK_MSG( \
         a_type__ == b_type__, \
         ET_TENSOR_CHECK_PREFIX__ ": dtype={%" PRId8 ", %" PRId8 "}", \
         static_cast<int8_t>(a_type__), \
         static_cast<int8_t>(b_type__)); \
-  })
+  } while (0)
 
 #define ET_CHECK_SAME_DTYPE3(a__, b__, c__) \
-  ({ \
+  do { \
     const ::executorch::aten::ScalarType a_type__ = (a__).scalar_type(); \
     const ::executorch::aten::ScalarType b_type__ = (b__).scalar_type(); \
     const ::executorch::aten::ScalarType c_type__ = (c__).scalar_type(); \
@@ -150,7 +150,7 @@
         static_cast<int8_t>(a_type__), \
         static_cast<int8_t>(b_type__), \
         static_cast<int8_t>(c_type__)); \
-  })
+  } while (0)
 
 /**
  * Asserts that all tensors have the same shape and dtype.
@@ -159,7 +159,7 @@
  * macros independently, because it only calls ET_CHECK_MSG once.
  */
 #define ET_CHECK_SAME_SHAPE_AND_DTYPE2(a__, b__) \
-  ({ \
+  do { \
     const size_t a_numel__ = (a__).numel(); \
     const size_t b_numel__ = (b__).numel(); \
     const size_t a_dim__ = (a__).dim(); \
@@ -189,10 +189,10 @@
         a_size__, \
         b_size__); \
     } \
-  })
+  } while (0)
 
 #define ET_CHECK_SAME_SHAPE_AND_DTYPE3(a__, b__, c__) \
-  ({ \
+  do { \
     const size_t a_numel__ = (a__).numel(); \
     const size_t b_numel__ = (b__).numel(); \
     const size_t c_numel__ = (c__).numel(); \
@@ -233,13 +233,13 @@
         b_size__, \
         c_size__); \
     } \
-  })
+  } while (0)
 
 /**
  * Assert that the input tensor is contiguous tensor.
  */
 #define ET_CHECK_CONTIGUOUS(a__) \
-  ({ \
+  do { \
     const ::executorch::aten::ArrayRef<executorch::aten::StridesType> \
         strides = a__.strides(); \
     const ::executorch::aten::ArrayRef<executorch::aten::StridesType> sizes = \
@@ -260,15 +260,15 @@
         strides[i - 1], \
         strides[i] * sizes[i]); \
     } \
-  })
+  } while (0)
 
 /**
  * Assert the input two tensors share same strides.
 * Noted that this function does not make any check or promise on the contiguity
 * of any input tensors.
 */
 #define ET_CHECK_SAME_STRIDES2(a__, b__) \
-  ({ \
+  do { \
     ET_CHECK_MSG( \
         a__.dim() == b__.dim(), \
         "Two tensors shall have same number of strides, but not %zu and %zu.", \
@@ -288,15 +288,15 @@
         (int32_t)a_strides[i], \
         (int32_t)b_strides[i]); \
     } \
-  })
+  } while (0)
 
 /**
  * Assert the input three tensors share same strides.
 * Noted that this function does not make any check or promise on the contiguity
 * of any input tensors.
 */
 #define ET_CHECK_SAME_STRIDES3(a__, b__, c__) \
-  ({ \
+  do { \
     ET_CHECK_MSG( \
         a__.dim() == b__.dim() && b__.dim() == c__.dim(), \
         "Three tensors shall have same number of strides, " \
@@ -322,17 +322,17 @@
         (int32_t)b_strides[i], \
         (int32_t)c_strides[i]); \
     } \
-  })
+  } while (0)
 
 #define ET_CHECK_DEFAULT_OR_CHANNELSLAST_DIMORDER(t__) \
-  ({ \
+  do { \
     ET_CHECK_MSG( \
         is_contiguous_dim_order( \
             t__.dim_order().data(), t__.dim_order().size()) || \
             is_channels_last_dim_order( \
                 t__.dim_order().data(), t__.dim_order().size()), \
         "Tensor must have default or channels last dim order"); \
-  })
+  } while (0)
 
 /**
  * DEPRECATED: Please use ET_CHECK_OR_RETURN_FALSE instead and provide
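Note on the tensor_util.h changes above: the `({ ... })` wrappers these macros used are GNU statement expressions, an extension MSVC does not implement, while `do { ... } while (0)` is standard C++ and still makes a multi-statement macro behave as a single statement. Since these checks are used as statements rather than for a value, the swap should be behavior-preserving. A small sketch of the idiom with a hypothetical macro, not the ExecuTorch one:

#include <cstdio>
#include <cstdlib>

// Multi-statement check macro wrapped in do { ... } while (0) so that the
// macro plus its trailing semicolon expand to exactly one statement.
#define MY_CHECK_MSG(cond, msg)                          \
  do {                                                   \
    if (!(cond)) {                                       \
      std::fprintf(stderr, "check failed: %s\n", (msg)); \
      std::abort();                                      \
    }                                                    \
  } while (0)

int main(int argc, char**) {
  // Because the expansion is a single statement, this brace-less if/else
  // parses the way it reads; a bare { ... } block would break on the `else`.
  if (argc == 1)
    MY_CHECK_MSG(argc > 0, "argc should be positive");
  else
    std::puts("called with extra arguments");
  return 0;
}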