
Commit 37e4213

Update
[ghstack-poisoned]
1 parent 40a1bce commit 37e4213

File tree

1 file changed: +38 -24 lines changed


kernels/portable/cpu/op_argmin.cpp

Lines changed: 38 additions & 24 deletions
@@ -12,6 +12,7 @@
 
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
 #include <executorch/runtime/platform/assert.h>
 
 namespace torch {
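The header added above, thread_parallel_interface.h, is where the executorch::extension::parallel_for call used in the next hunk comes from. As a rough illustration (a hypothetical single-threaded stand-in, not the ExecuTorch implementation), the chunking contract the call site relies on is that the [begin, end) index range is split into chunks of at most grain_size indices and the callback runs once per chunk:

// Hypothetical stand-in for executorch::extension::parallel_for, used only to
// illustrate the chunking behavior implied by the call site in this diff.
// The real implementation may dispatch each chunk to a thread pool.
#include <algorithm>
#include <cstdint>
#include <cstdio>

template <typename Func>
void mock_parallel_for(
    int64_t begin, int64_t end, int64_t grain_size, const Func& f) {
  for (int64_t chunk_begin = begin; chunk_begin < end;
       chunk_begin += grain_size) {
    const int64_t chunk_end = std::min(chunk_begin + grain_size, end);
    f(chunk_begin, chunk_end);  // each chunk could run on its own thread
  }
}

int main() {
  // 10 output elements, grain size 4 -> chunks [0,4), [4,8), [8,10).
  mock_parallel_for(0, 10, 4, [](int64_t b, int64_t e) {
    std::printf("chunk [%lld, %lld)\n", static_cast<long long>(b),
                static_cast<long long>(e));
  });
  return 0;
}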
@@ -47,30 +48,43 @@ Tensor& argmin_out(
   ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] {
     long* out_data = out.mutable_data_ptr<long>();
 
-    for (const auto out_ix : c10::irange(out.numel())) {
-      std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
-          [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
-            // the below condition as written is equivalent to !isnan(accval) &&
-            // (isnan(v) || v < acc_val). cases:
-            // - if neither acc_val nor v is NaN, !(v >= acc_val) is
-            // trivially equivalent to v < acc_val.
-            // - if acc_val is NaN, the whole thing is trivially false.
-            // - if acc_val is not NaN and v is NaN, then v >= acc_val
-            // - is false because all comparisons involving NaN are
-            // - false, so the result is true. The result is trivially
-            // - true for the above condition that uses isnan(v) as
-            // - well.
-            if (!std::isnan(acc_val) && !(v >= acc_val)) {
-              acc_val = v;
-              acc_ix = ix;
-            }
-            return std::tuple<CTYPE, long>{acc_val, acc_ix};
-          },
-          in,
-          dim,
-          out_ix);
-      out_data[out_ix] = std::get<1>(acc);
-    }
+    // REVIEW: this is the parallelization strategy ATen uses
+    // specifically when the reduction is along the last dimension and
+    // that dimension is contiguous. Is there any particular reason we
+    // shouldn't just always use this strategy since we aren't
+    // otherwise capable of parallelizing reductions?
+    const auto reduction_size =
+        dim.has_value() ? in.sizes().at(dim.value()) : in.numel();
+    const auto grain_size = std::max(
+        static_cast<int64_t>(1),
+        executorch::extension::internal::GRAIN_SIZE / reduction_size);
+    executorch::extension::parallel_for(
+        0, out.numel(), grain_size, [&](const auto begin, const auto end) {
+          for (const auto out_ix : c10::irange(begin, end)) {
+            std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
+                [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
+                  // the below condition as written is equivalent to
+                  // !isnan(accval) && (isnan(v) || v < acc_val). cases:
+                  // - if neither acc_val nor v is NaN, !(v >= acc_val) is
+                  // trivially equivalent to v < acc_val.
+                  // - if acc_val is NaN, the whole thing is trivially false.
+                  // - if acc_val is not NaN and v is NaN, then v >= acc_val
+                  // - is false because all comparisons involving NaN are
+                  // - false, so the result is true. The result is trivially
+                  // - true for the above condition that uses isnan(v) as
+                  // - well.
+                  if (!std::isnan(acc_val) && !(v >= acc_val)) {
+                    acc_val = v;
+                    acc_ix = ix;
+                  }
+                  return std::tuple<CTYPE, long>{acc_val, acc_ix};
+                },
+                in,
+                dim,
+                out_ix);
+            out_data[out_ix] = std::get<1>(acc);
+          }
+        });
   });
 
   return out;
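For a concrete sense of the grain-size arithmetic in the hunk above: executorch::extension::internal::GRAIN_SIZE is an internal runtime constant, so the 32768 in this sketch is only an assumed placeholder, but it shows how larger reductions get smaller grains so that the work per parallel chunk stays roughly constant.

// Illustration of the grain-size formula from the diff:
//   grain_size = max(1, GRAIN_SIZE / reduction_size)
// The GRAIN_SIZE value here is an assumed placeholder, not the runtime's.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t kAssumedGrainSize = 32768;  // placeholder value
  // e.g. argmin over dim=1 of a [128, 1024] tensor: reduction_size = 1024,
  // out.numel() = 128.
  const int64_t reduction_size = 1024;
  const int64_t grain_size =
      std::max(static_cast<int64_t>(1), kAssumedGrainSize / reduction_size);
  std::printf("grain_size = %lld\n", static_cast<long long>(grain_size));
  // -> 32: each parallel chunk covers up to 32 of the 128 output elements,
  // so each chunk reduces about 32 * 1024 elements, i.e. roughly GRAIN_SIZE.
  return 0;
}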
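The comment inside the reduction lambda argues that !std::isnan(acc_val) && !(v >= acc_val) is the same predicate as !std::isnan(acc_val) && (std::isnan(v) || v < acc_val). A quick standalone check over the NaN/non-NaN combinations (a verification sketch, not part of the kernel) bears that out:

// Standalone check that the two forms of the "should v replace the current
// minimum?" predicate from the diff agree, including the NaN cases.
#include <cassert>
#include <cmath>
#include <limits>

static bool as_written(float v, float acc_val) {
  return !std::isnan(acc_val) && !(v >= acc_val);
}

static bool as_explained(float v, float acc_val) {
  return !std::isnan(acc_val) && (std::isnan(v) || v < acc_val);
}

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  const float vals[] = {-1.0f, 0.0f, 2.5f, nan};
  for (float v : vals) {
    for (float acc : vals) {
      assert(as_written(v, acc) == as_explained(v, acc));
    }
  }
  // Net effect: a NaN candidate displaces any non-NaN accumulator, but once
  // the accumulator is NaN it is never replaced.
  return 0;
}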
