pytorch · swolchok · Mar 6, 2025 · Mar 1, 2025 · Mar 1, 2025 · Mar 1, 2025
diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <type_traits>
+
+#include <c10/util/irange.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_dimension_limit.h>
+
+namespace torch::executor {
+
+namespace internal {
+/**
+ * Forward iterator over the linear index space of an output tensor that
+ * also tracks, in lockstep, the matching linear index into each of
+ * kNumInputs possibly-broadcasted input tensors.
+ *
+ * Dereferencing yields a std::array<ssize_t, kNumInputs + 1>: entry 0 is
+ * the output linear index, and entries 1..kNumInputs are the input
+ * indexes in constructor argument order. Input indexes are accumulated
+ * from each input's strides(), with broadcast (size-1 or missing leading)
+ * dimensions contributing a stride of 0.
+ *
+ * Two iterators compare equal when their output linear indexes are
+ * equal; the remaining state is not compared. Only compare iterators
+ * that were created for the same set of tensors.
+ */
+template <std::size_t kNumInputs>
+class BroadcastIndexesIterator {
+ public:
+  using difference_type = ssize_t;
+  using value_type = std::array<ssize_t, kNumInputs + 1>;
+  using reference = const value_type&;
+  using pointer = const value_type*;
+  using iterator_category = std::forward_iterator_tag;
+
+  BroadcastIndexesIterator() = default;
+
+  /**
+   * Creates an iterator positioned at the first element (all indexes 0).
+   *
+   * @param output The output tensor whose index space is traversed.
+   * @param args Exactly kNumInputs input tensors, each with dim() no
+   *     greater than output.dim() (checked with ET_CHECK_MSG).
+   */
+  template <typename... Args>
+  explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args)
+      : output_dim_(output.dim()),
+        output_shape_(output.sizes()),
+        effective_input_broadcast_strides_{
+            effective_input_broadcast_stride(output, args)...} {
+    static_assert(
+        sizeof...(args) == kNumInputs && (std::is_same_v<Args, Tensor> && ...),
+        "BroadcastIndexesIterator constructor requires kNumInputs input tensor "
+        "arguments!");
+  }
+
+  // Tag type selecting the end-iterator constructor.
+  struct make_end_t {
+    explicit constexpr make_end_t() = default;
+  };
+
+  /**
+   * Creates the past-the-end iterator: its output index is t.numel().
+   * The input tensors are accepted only so that begin/end can be built
+   * from the same argument list; they are not inspected.
+   */
+  template <typename... Args>
+  BroadcastIndexesIterator(make_end_t, const Tensor& t, const Args&...)
+      : current_indexes_{
+            t.numel(),
+            0,
+        } {}
+
+  // Equality is defined solely by the output linear index.
+  bool operator==(const BroadcastIndexesIterator& rhs) const {
+    return output_index() == rhs.output_index();
+  }
+
+  bool operator!=(const BroadcastIndexesIterator& rhs) const {
+    return !operator==(rhs);
+  }
+
+  reference operator*() const {
+    return current_indexes_;
+  }
+
+  pointer operator->() const {
+    return &current_indexes_;
+  }
+
+  /**
+   * Advances to the next output element, updating every input index
+   * incrementally (odometer-style carry over the output dimensions)
+   * instead of re-deriving it with division/modulo.
+   */
+  BroadcastIndexesIterator& operator++() {
+    output_index()++;
+    // TODO: add optimization for particular input tensors not being
+    // broadcasted?
+    for (auto ii = output_dim_ - 1; ii >= 0; --ii) {
+      // You might wonder what happens if output_shape_[ii] == 0. In
+      // that case, output.numel() would be 0, and thus we would have
+      // begin() == end() and no iteration.
+      if ET_UNLIKELY (delinearized_output_index_[ii] == output_shape_[ii] - 1) {
+        // This dimension wraps around to 0: undo everything it had
+        // contributed to each input index, then carry into the next
+        // outer dimension. Widen before multiplying so the product is
+        // computed in ssize_t rather than in SizesType.
+        const auto old_delinearized_output_index_item =
+            static_cast<ssize_t>(delinearized_output_index_[ii]);
+        delinearized_output_index_[ii] = 0;
+        for (const auto jj : c10::irange(1, kNumInputs + 1)) {
+          current_indexes_[jj] -= old_delinearized_output_index_item *
+              effective_input_broadcast_strides_[jj - 1][ii];
+        }
+      } else {
+        // No carry needed: step this dimension by one and stop.
+        delinearized_output_index_[ii]++;
+        for (const auto jj : c10::irange(1, kNumInputs + 1)) {
+          current_indexes_[jj] +=
+              effective_input_broadcast_strides_[jj - 1][ii];
+        }
+        break;
+      }
+    }
+    return *this;
+  }
+
+  BroadcastIndexesIterator operator++(int) {
+    auto it = *this;
+    operator++();
+    return it;
+  }
+
+  // Distance between two iterators, measured in output elements.
+  difference_type operator-(const BroadcastIndexesIterator& rhs) const {
+    return difference_type(output_index() - rhs.output_index());
+  }
+
+ private:
+  ssize_t output_index() const {
+    return current_indexes_[0];
+  }
+
+  ssize_t& output_index() {
+    return current_indexes_[0];
+  }
+
+  /**
+   * Builds the per-output-dimension stride array for input `t`: t's own
+   * stride where t has a non-1 size, and 0 where t is broadcast (size 1,
+   * or a leading dimension t does not have).
+   */
+  std::array<exec_aten::SizesType, executorch::runtime::kTensorDimensionLimit>
+  effective_input_broadcast_stride(const Tensor& output, const Tensor& t)
+      const {
+    // Zero-initialized, so the leading dimensions that `t` lacks already
+    // have stride 0 and need no separate fill.
+    std::array<exec_aten::SizesType, executorch::runtime::kTensorDimensionLimit>
+        result = {0};
+    ET_CHECK_MSG(
+        t.dim() <= output.dim(),
+        "input to broadcasting op should have dim at most output dim, but %d > %d!",
+        (int)t.dim(),
+        (int)output.dim());
+
+    const auto num_leading_ones = output.dim() - t.dim();
+    const auto t_sizes = t.sizes();
+    const auto t_strides = t.strides();
+    for (const auto idx :
+         c10::irange(num_leading_ones, num_leading_ones + t.dim())) {
+      result[idx] = t_sizes[idx - num_leading_ones] == 1
+          ? 0
+          : t_strides[idx - num_leading_ones];
+    }
+    return result;
+  }
+
+  // The 0th entry is the current linear index into the output,
+  // followed by kNumInputs input indexes.
+  std::array<ssize_t, kNumInputs + 1> current_indexes_ = {0};
+  using ShapeType = std::
+      array<exec_aten::SizesType, executorch::runtime::kTensorDimensionLimit>;
+  // Current multi-dimensional position within the output shape.
+  ShapeType delinearized_output_index_ = {0};
+  // Initialized to 0 so that default-constructed and end iterators never
+  // carry (or copy) an indeterminate value.
+  ssize_t output_dim_ = 0;
+  ArrayRef<exec_aten::SizesType> output_shape_;
+  // The linear index for a broadcast tensor is
+  // sum(delinearized_output_index_[i] * input_stride[i] if
+  // padded_input_shape[i] != 1 else 0), where padded_input_shape is
+  // input.sizes() with leading 1s added to make its size equal to
+  // output_dim. This is straightforwardly implementable with an
+  // adjusted stride array that contains 0s where the padded input
+  // shape would contain 1s.
+  std::array<ShapeType, kNumInputs> effective_input_broadcast_strides_ = {
+      {{0}}};
+};
+} // namespace internal
+
+/**
+ * Efficient mechanism for looping over the index space for an output
+ * tensor and kNumInputs possibly-broadcasted input tensors. Use as follows:
+ *
+ * auto* output_data = output.mutable_data_ptr<OutputType>();
+ * const auto* a_data = a.const_data_ptr<AType>();
+ * const auto* b_data = b.const_data_ptr<BType>();
+ * for (const auto [output_index, a_index, b_index] :
+ *      BroadcastIndexesRange<2>(output, a, b)) {
+ *   // Access output_data[output_index], a_data[a_index], and b_data[b_index].
+ * }
+ *
+ * (where OutputType, AType, and BType are known concrete types.)
+ *
+ * Unlike looping using delinearize_index() and
+ * linearize_access_indexes(), BroadcastIndexesRange avoids expensive
+ * division and modulo operations on each iteration.
+ *
+ * The range stores pointers to the tensors it is given rather than
+ * copies, so every tensor must outlive the range and any iterators
+ * obtained from it.
+ */
+template <std::size_t kNumInputs>
+class BroadcastIndexesRange {
+ public:
+  using iterator = internal::BroadcastIndexesIterator<kNumInputs>;
+
+  /**
+   * @param output The output tensor whose index space is traversed.
+   * @param args Exactly kNumInputs input tensors.
+   */
+  template <typename... Args>
+  BroadcastIndexesRange(const Tensor& output, const Args&... args)
+      : tensors_{&output, (&args)...} {
+    // Passing too few inputs would otherwise compile, leave null
+    // entries in tensors_, and dereference them in begin()/end().
+    static_assert(
+        sizeof...(args) == kNumInputs,
+        "BroadcastIndexesRange constructor requires kNumInputs input tensor "
+        "arguments!");
+  }
+
+  // Iterator positioned at the first output element.
+  iterator begin() const {
+    return std::apply(
+        [](const auto&... args) { return iterator((*args)...); }, tensors_);
+  }
+
+  // Past-the-end iterator (output index == output.numel()).
+  iterator end() const {
+    return std::apply(
+        [](const auto&... args) {
+          return iterator(typename iterator::make_end_t(), (*args)...);
+        },
+        tensors_);
+  }
+
+ private:
+  // Entry 0 is the output tensor; entries 1..kNumInputs are the inputs.
+  std::array<const Tensor*, kNumInputs + 1> tensors_;
+};
+} // namespace torch::executor
@@ -9,6 +9,7 @@
 #pragma once
 
 #include <c10/util/irange.h>
+#include <executorch/kernels/portable/cpu/util/broadcast_indexes_range.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 
@@ -290,23 +291,18 @@ inline void apply_binary_elementwise_fn(
   const CTYPE_B* const data_b = b.const_data_ptr<CTYPE_B>();
   CTYPE_OUT* const data_out = out.mutable_data_ptr<CTYPE_OUT>();
 
-  for (const auto i : c10::irange(out.numel())) {
-    size_t a_linear_index = i;
-    size_t b_linear_index = i;
-
-    if (any_is_broadcasted) {
-      size_t out_indexes[kTensorDimensionLimit];
-      delinearize_index(i, out, out_indexes, kTensorDimensionLimit);
-
-      if (a_is_broadcasted) {
-        a_linear_index = linearize_access_indexes(out_indexes, out.dim(), a);
-      }
-      if (b_is_broadcasted) {
-        b_linear_index = linearize_access_indexes(out_indexes, out.dim(), b);
-      }
+  if (any_is_broadcasted) {
+    for (const auto [out_index, a_index, b_index] :
+         BroadcastIndexesRange<2>(out, a, b)) {
+      data_out[out_index] = compute_fun(data_a[a_index], data_b[b_index]);
     }
+  } else {
+    for (const auto i : c10::irange(out.numel())) {
+      size_t a_linear_index = i;
+      size_t b_linear_index = i;
 
-    data_out[i] = compute_fun(data_a[a_linear_index], data_b[b_linear_index]);
+      data_out[i] = compute_fun(data_a[a_linear_index], data_b[b_linear_index]);
+    }
   }
 }
 
@@ -338,28 +334,16 @@ inline void apply_ternary_elementwise_fn(
   const CTYPE_C* const data_c = c.const_data_ptr<CTYPE_C>();
   CTYPE_OUT* const data_out = out.mutable_data_ptr<CTYPE_OUT>();
 
-  for (const auto i : c10::irange(out.numel())) {
-    size_t a_linear_index = i;
-    size_t b_linear_index = i;
-    size_t c_linear_index = i;
-
-    if (any_is_broadcasted) {
-      size_t out_indexes[kTensorDimensionLimit];
-      delinearize_index(i, out, out_indexes, kTensorDimensionLimit);
-
-      if (a_is_broadcasted) {
-        a_linear_index = linearize_access_indexes(out_indexes, out.dim(), a);
-      }
-      if (b_is_broadcasted) {
-        b_linear_index = linearize_access_indexes(out_indexes, out.dim(), b);
-      }
-      if (c_is_broadcasted) {
-        c_linear_index = linearize_access_indexes(out_indexes, out.dim(), c);
-      }
+  if (any_is_broadcasted) {
+    for (const auto [out_index, a_index, b_index, c_index] :
+         BroadcastIndexesRange<3>(out, a, b, c)) {
+      data_out[out_index] =
+          compute_fun(data_a[a_index], data_b[b_index], data_c[c_index]);
+    }
+  } else {
+    for (const auto i : c10::irange(out.numel())) {
+      data_out[i] = compute_fun(data_a[i], data_b[i], data_c[i]);
     }
-
-    data_out[i] = compute_fun(
-        data_a[a_linear_index], data_b[b_linear_index], data_c[c_linear_index]);
   }
 }
 

@@ -9,6 +9,7 @@
 #pragma once
 
 #include <c10/util/irange.h>
+#include <executorch/kernels/portable/cpu/util/broadcast_indexes_range.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/dtype_util.h>
 #include <executorch/runtime/kernel/kernel_runtime_context.h>
@@ -121,26 +122,24 @@ inline void apply_bitensor_elementwise_fn(
   char* const data_out = reinterpret_cast<char*>(out.mutable_data_ptr());
 
   auto out_numel = out.numel();
-  for (const auto i : c10::irange(out_numel)) {
-    size_t a_linear_index = i;
-    size_t b_linear_index = i;
-
-    if (any_is_broadcasted) {
-      size_t out_indexes[kTensorDimensionLimit];
-      delinearize_index(i, out, out_indexes, kTensorDimensionLimit);
-
-      if (a_is_broadcasted) {
-        a_linear_index = linearize_access_indexes(out_indexes, out.dim(), a);
-      }
-      if (b_is_broadcasted) {
-        b_linear_index = linearize_access_indexes(out_indexes, out.dim(), b);
-      }
+  if (any_is_broadcasted) {
+    for (const auto [out_index, a_index, b_index] :
+         BroadcastIndexesRange<2>(out, a, b)) {
+      auto result = compute_fun(
+          load_a_to_common(&data_a[a_index * a_element_size]),
+          load_b_to_common(&data_b[b_index * b_element_size]));
+      store_common_to_out(result, &data_out[out_index * out_element_size]);
+    }
+  } else {
+    for (const auto i : c10::irange(out_numel)) {
+      size_t a_linear_index = i;
+      size_t b_linear_index = i;
+
+      auto result = compute_fun(
+          load_a_to_common(&data_a[a_linear_index * a_element_size]),
+          load_b_to_common(&data_b[b_linear_index * b_element_size]));
+      store_common_to_out(result, &data_out[i * out_element_size]);
     }
-
-    auto result = compute_fun(
-        load_a_to_common(&data_a[a_linear_index * a_element_size]),
-        load_b_to_common(&data_b[b_linear_index * b_element_size]));
-    store_common_to_out(result, &data_out[i * out_element_size]);
   }
 }
 
@@ -211,31 +210,27 @@ inline void apply_tritensor_elementwise_fn(
   char* const data_out = reinterpret_cast<char*>(out.mutable_data_ptr());
 
   auto out_numel = out.numel();
-  for (const auto i : c10::irange(out_numel)) {
-    size_t a_linear_index = i;
-    size_t b_linear_index = i;
-    size_t c_linear_index = i;
-
-    if (any_is_broadcasted) {
-      size_t out_indexes[kTensorDimensionLimit];
-      delinearize_index(i, out, out_indexes, kTensorDimensionLimit);
-
-      if (a_is_broadcasted) {
-        a_linear_index = linearize_access_indexes(out_indexes, out.dim(), a);
-      }
-      if (b_is_broadcasted) {
-        b_linear_index = linearize_access_indexes(out_indexes, out.dim(), b);
-      }
-      if (c_is_broadcasted) {
-        c_linear_index = linearize_access_indexes(out_indexes, out.dim(), c);
-      }
+  if (any_is_broadcasted) {
+    for (const auto [out_index, a_index, b_index, c_index] :
+         BroadcastIndexesRange<3>(out, a, b, c)) {
+      auto result = compute_fun(
+          load_a_to_common(&data_a[a_index * a_element_size]),
+          load_b_to_common(&data_b[b_index * b_element_size]),
+          load_c_to_common(&data_c[c_index * c_element_size]));
+      store_common_to_out(result, &data_out[out_index * out_element_size]);
+    }
+  } else {
+    for (const auto i : c10::irange(out_numel)) {
+      size_t a_linear_index = i;
+      size_t b_linear_index = i;
+      size_t c_linear_index = i;
+
+      auto result = compute_fun(
+          load_a_to_common(&data_a[a_linear_index * a_element_size]),
+          load_b_to_common(&data_b[b_linear_index * b_element_size]),
+          load_c_to_common(&data_c[c_linear_index * c_element_size]));
+      store_common_to_out(result, &data_out[i * out_element_size]);
     }
-
-    auto result = compute_fun(
-        load_a_to_common(&data_a[a_linear_index * a_element_size]),
-        load_b_to_common(&data_b[b_linear_index * b_element_size]),
-        load_c_to_common(&data_c[c_linear_index * c_element_size]));
-    store_common_to_out(result, &data_out[i * out_element_size]);
   }
 }