48 changes: 11 additions & 37 deletions kernels/portable/cpu/op__to_dim_order_copy.cpp
@@ -9,8 +9,8 @@
#include <c10/util/irange.h>

#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
@@ -31,47 +31,21 @@ using Optional = executorch::aten::optional<T>;

namespace {

// TODO(T179241236): Update core/exec_aten/util/tensor_util.h to support dim
// order other than contiguous.
int64_t coordinateToIndexWithDimOrder(
const Tensor& self,
const size_t* cur_indices) {
int64_t index = 0;
executorch::aten::StridesType strides[kTensorDimensionLimit];
SizesArrayRef sizes = self.sizes();
DimOrderArrayRef dim_order = self.dim_order();

dim_order_to_stride_nocheck(
sizes.data(), dim_order.data(), sizes.size(), strides);
for (const auto i : c10::irange(self.dim())) {
index += cur_indices[i] * strides[i];
}
return index;
}

template <typename SELF_CTYPE, typename OUT_CTYPE>
void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
auto out_data = out.mutable_data_ptr<OUT_CTYPE>();

size_t coordinate[kTensorDimensionLimit] = {0};

// Copy data from self to out index by index. Same index in self and out
// should have same value, no matter the order of dimensions.
for (ssize_t i = 0; i < self.numel(); i++) {
// Update the current indices.
for (ssize_t j = self.dim() - 1; j >= 0; j--) {
if (coordinate[j] + 1 < static_cast<size_t>(self.size(j))) {
coordinate[j]++;
break;
} else {
coordinate[j] = 0;
}
}
// Get the corresponding index of self_data and out_data by stride.
int64_t self_data_index = coordinateToIndexWithDimOrder(self, coordinate);
int64_t out_data_index = coordinateToIndexWithDimOrder(out, coordinate);

// Here we make a slightly off-label use of
// BroadcastIndexesRange. It always assumes it doesn't have to care
// about different dim_order between input and output, but we can
// just force it to respect strides (and thus dim_order) for its
// inputs using support_noncontiguous_input_tensors=true, and then pretend
// the output is just another input.
for (const auto [unused_index, self_data_index, out_data_index] :
BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
/*dummy output*/ self, self, out)) {
(void)unused_index;
out_data[out_data_index] =
static_cast<OUT_CTYPE>(self_data[self_data_index]);
}
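To make the intent of this change concrete: the deleted `coordinateToIndexWithDimOrder` helper converted a logical coordinate into a physical offset via dim_order-derived strides, which is exactly the remapping `BroadcastIndexesRange` now performs internally. Below is a standalone sketch in plain C++ (no ExecuTorch headers; `strides_for_dim_order` is an illustrative stand-in for `dim_order_to_stride_nocheck`) showing why the same logical coordinate lands at different physical offsets when source and destination use different dim orders.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Compute strides for a given dim_order: the last entry of dim_order is the
// fastest-moving dimension (stride 1), mirroring what
// dim_order_to_stride_nocheck does in ExecuTorch.
std::vector<int64_t> strides_for_dim_order(
    const std::vector<int64_t>& sizes,
    const std::vector<size_t>& dim_order) {
  std::vector<int64_t> strides(sizes.size());
  int64_t running = 1;
  for (size_t i = dim_order.size(); i-- > 0;) {
    strides[dim_order[i]] = running;
    running *= sizes[dim_order[i]];
  }
  return strides;
}

int main() {
  // A 2x3 tensor stored contiguously (dim_order {0, 1}) copied into a buffer
  // laid out with dim_order {1, 0}, i.e. column-major.
  std::vector<int64_t> sizes = {2, 3};
  std::vector<float> src = {1, 2, 3, 4, 5, 6}; // row-major storage
  std::vector<float> dst(6);

  auto src_strides = strides_for_dim_order(sizes, {0, 1}); // {3, 1}
  auto dst_strides = strides_for_dim_order(sizes, {1, 0}); // {1, 2}

  // Same logical coordinate (i, j), different physical offsets.
  for (int64_t i = 0; i < sizes[0]; ++i) {
    for (int64_t j = 0; j < sizes[1]; ++j) {
      dst[i * dst_strides[0] + j * dst_strides[1]] =
          src[i * src_strides[0] + j * src_strides[1]];
    }
  }
  // dst is now {1, 4, 2, 5, 3, 6}: the same logical values, reordered.
  return 0;
}
```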
2 changes: 1 addition & 1 deletion kernels/portable/cpu/op_glu.cpp
@@ -110,7 +110,7 @@ Tensor& glu_out_tensor(
split_input.second_half,
utils::SupportedTensorDtypes::FLOATHBF16,
out,
utils::internal::SupportNoncontiguousTensors());
utils::internal::SupportNoncontiguousInputTensors());
});
return out;
}
25 changes: 16 additions & 9 deletions kernels/portable/cpu/util/broadcast_indexes_range.h
@@ -43,7 +43,9 @@ inline bool sizes_match_ignoring_leading_1s(
std::equal(lhs_begin, lhs_end, rhs_begin);
}

template <std::size_t kNumInputs, bool support_noncontiguous_tensors = false>
template <
std::size_t kNumInputs,
bool support_noncontiguous_input_tensors = false>
class BroadcastIndexesIterator {
public:
using difference_type = ssize_t;
@@ -57,7 +59,7 @@ class BroadcastIndexesIterator {
template <typename... Args>
explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args)
: output_dim_or_zero_if_no_broadcasting_(
!support_noncontiguous_tensors &&
!support_noncontiguous_input_tensors &&
(sizes_match_ignoring_leading_1s(
args.sizes(),
output.sizes()) &&
@@ -69,7 +71,7 @@
sizeof...(args) == kNumInputs && (std::is_same_v<Args, Tensor> && ...),
"BroadcastIndexesIterator constructor requires kNumInputs input tensor"
"arguments!");
if (support_noncontiguous_tensors ||
if (support_noncontiguous_input_tensors ||
output_dim_or_zero_if_no_broadcasting_ != 0) {
effective_input_broadcast_strides_ = {
effective_input_broadcast_stride(output, args)...};
@@ -254,16 +256,21 @@ class BroadcastIndexesIterator {
* linearize_access_indexes(), BroadcastIndexesRange avoids expensive
* division and modulo operations on each iteration.
*
* The support_noncontiguous_tensors argument disables an optimization
* that causes the iterators not to respect strides in some
* cases. This optimization is normally safe because ExecuTorch
* tensors are contiguous.
* The support_noncontiguous_input_tensors argument disables an
* optimization that causes the iterators not to respect strides in
* some cases for input tensors. This optimization is normally safe
* because ExecuTorch tensors are contiguous. Non-contiguous output
* tensors are currently never supported (but note that this can be
* worked around by ignoring the output index and providing the true
* output as an extra input).
*/
template <std::size_t kNumInputs, bool support_noncontiguous_tensors = false>
template <
std::size_t kNumInputs,
bool support_noncontiguous_input_tensors = false>
class BroadcastIndexesRange {
public:
using iterator = internal::
BroadcastIndexesIterator<kNumInputs, support_noncontiguous_tensors>;
BroadcastIndexesIterator<kNumInputs, support_noncontiguous_input_tensors>;

template <typename... Args>
BroadcastIndexesRange(const Tensor& output, const Args&... args)
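For contrast with the off-label use in `op__to_dim_order_copy.cpp`, here is a minimal sketch of the ordinary broadcasting use of `BroadcastIndexesRange`, written in the style of the portable kernels. It assumes the usual `torch::executor` kernel namespace and `Tensor` alias, and that including `broadcast_util.h` makes `BroadcastIndexesRange` available, as it does for the copy kernel above; `add_broadcasting_impl` is an illustrative name, not an existing kernel.

```cpp
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>

// Assumes out has already been resized to the broadcasted shape of a and b.
template <typename CTYPE>
void add_broadcasting_impl(const Tensor& a, const Tensor& b, Tensor& out) {
  const CTYPE* a_data = a.const_data_ptr<CTYPE>();
  const CTYPE* b_data = b.const_data_ptr<CTYPE>();
  CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
  // Each iteration yields [out_index, a_index, b_index]. The iterator walks
  // the output coordinates incrementally, so mapping an output position back
  // to its (possibly broadcast) input positions needs no per-element
  // division or modulo.
  for (const auto [out_index, a_index, b_index] :
       BroadcastIndexesRange<2>(out, a, b)) {
    out_data[out_index] = a_data[a_index] + b_data[b_index];
  }
}
```

Passing `support_noncontiguous_input_tensors=true` as the second template argument, as the copy kernel does, additionally forces the effective input strides to respect each input's dim_order instead of assuming contiguity.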
10 changes: 5 additions & 5 deletions kernels/portable/cpu/util/elementwise_util.h
@@ -56,8 +56,8 @@ namespace internal {
* strides; normally, this is not strictly necessary because ExecuTorch
* Tensors are contiguous.
*/
struct SupportNoncontiguousTensors {
explicit SupportNoncontiguousTensors() = default;
struct SupportNoncontiguousInputTensors {
explicit SupportNoncontiguousInputTensors() = default;
};

template <
@@ -292,7 +292,7 @@ inline void apply_unitensor_elementwise_fn(
const Tensor& a,
SupportedTensorDtypes a_dtypes,
const Tensor& out,
SupportNoncontiguousTensors) {
SupportNoncontiguousInputTensors) {
internal::apply_elementwise_fn<
CTYPE_COMPUTE,
op_name,
@@ -366,7 +366,7 @@ inline void apply_bitensor_elementwise_fn(
const Tensor& b,
SupportedTensorDtypes b_dtypes,
const Tensor& out,
SupportNoncontiguousTensors) {
SupportNoncontiguousInputTensors) {
internal::apply_elementwise_fn<
CTYPE_COMPUTE,
op_name,
@@ -467,7 +467,7 @@ inline void apply_tritensor_elementwise_fn(
const Tensor& c,
SupportedTensorDtypes c_dtypes,
const Tensor& out,
SupportNoncontiguousTensors) {
SupportNoncontiguousInputTensors) {
internal::apply_elementwise_fn<
CTYPE_COMPUTE,
op_name,
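The renamed `SupportNoncontiguousInputTensors` tag is a standard tag-dispatch opt-in. The standalone sketch below (illustrative names and signatures, not the ExecuTorch ones) shows the idiom: an empty struct with an explicit default constructor selects the strided-input overload at the call site, rather than a bool flag that could be passed by accident.

```cpp
#include <cstdio>

struct SupportNoncontiguousInputTensors {
  explicit SupportNoncontiguousInputTensors() = default;
};

// Fast path: assumes contiguous inputs.
void apply_fn(int n) {
  std::printf("contiguous path, n=%d\n", n);
}

// Opt-in path: respects input strides / dim_order.
void apply_fn(int n, SupportNoncontiguousInputTensors) {
  std::printf("strided-input path, n=%d\n", n);
}

int main() {
  apply_fn(4);
  // Explicit opt-in, mirroring the call style in op_glu.cpp.
  apply_fn(4, SupportNoncontiguousInputTensors());
  return 0;
}
```

Because the constructor is explicit, a caller cannot land on the strided path by passing `{}`; the opt-in has to be spelled out, as `op_glu.cpp` does with `utils::internal::SupportNoncontiguousInputTensors()`.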
@@ -1324,6 +1324,7 @@ ATEN_OPS = (
name = "op__to_dim_order_copy",
deps = [
":scalar_utils",
"//executorch/kernels/portable/cpu/util:broadcast_util",
"//executorch/kernels/portable/cpu/util:copy_ops_util",
],
),