This repository was archived by the owner on Aug 21, 2025. It is now read-only.

Commit 731e05f

Make one of the boxed fallbacks faster
We should do this for the others
1 parent ea180d2 commit 731e05f
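What changed: the previous boxed_existing_bdim_all_batch_rule popped every argument off the torch::jit::Stack into a temporary vector, collected the unpacked tensors into heap-allocated std::vectors, and then pushed everything back. The new version leaves the arguments on the stack and overwrites only the tensor slots in place, uses SmallVector with inline storage so the common case of a few tensor arguments does not allocate, and factors the unpack-and-compute-batch-size step into a reusable find_and_unpack_tensors helper that the other boxed fallbacks could adopt.

A minimal sketch of the in-place idea, using std::vector<int64_t> as a stand-in for torch::jit::Stack; the names here are illustrative and not part of the commit:

#include <cassert>
#include <cstdint>
#include <vector>

// Stand-in for torch::jit::Stack, which is a vector of c10::IValue.
using Stack = std::vector<int64_t>;

// The old rule popped the trailing num_args values into a temporary vector,
// transformed them, and pushed them back (extra copies plus heap allocations).
// The commit's approach: compute where the arguments begin on the stack and
// rewrite those slots in place.
void transform_args_in_place(Stack& stack, int64_t num_args) {
  const int64_t args_begin = static_cast<int64_t>(stack.size()) - num_args;
  assert(args_begin >= 0);
  for (int64_t idx = 0; idx < num_args; ++idx) {
    stack[args_begin + idx] *= 2;  // placeholder for "reshape the batched tensor"
  }
}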

File tree: 1 file changed

functorch/csrc/BatchRulesHelper.h

Lines changed: 54 additions & 51 deletions
@@ -179,77 +179,80 @@ inline void handle_variadic_bdims(std::vector<std::pair<Tensor, optional<int64_t
 #define VARIADIC_BDIMS_BOXED(op) \
   m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_tensor_inputs_batch_rule<decltype(&handle_variadic_bdims), &handle_variadic_bdims>>());
 
+using UnpackedBatchedTensor = std::tuple<Tensor,optional<int64_t>>;
+
+inline void find_and_unpack_tensors(
+    const torch::jit::Stack* stack,
+    int64_t num_args,
+    int64_t cur_level,
+    SmallVector<UnpackedBatchedTensor, 5>* tensors,
+    SmallVector<int64_t, 5>* tensors_pos,
+    int64_t* batch_size) {
+
+  int64_t computed_batch_size = -1;
+  int64_t args_begin = stack->size() - num_args;
+
+  for (const auto idx : c10::irange(0, num_args)) {
+    const auto& ivalue = (*stack)[args_begin + idx];
+    if (!ivalue.isTensor()) {
+      continue;
+    }
+    auto unpacked = unwrapTensorAtLevel(ivalue.toTensor(), cur_level);
+    const auto& tensor_value = std::get<0>(unpacked);
+    const auto tensor_bdim = std::get<1>(unpacked);
+    if (tensor_bdim.has_value()) {
+      auto candidate_batch_size = tensor_value.size(*tensor_bdim);
+      if (computed_batch_size == -1) {
+        computed_batch_size = candidate_batch_size;
+      }
+      TORCH_INTERNAL_ASSERT(candidate_batch_size == computed_batch_size);
+    }
+
+    tensors->push_back(std::move(unpacked));
+    tensors_pos->push_back(idx);
+  }
+  TORCH_INTERNAL_ASSERT(computed_batch_size > -1);
+  *batch_size = computed_batch_size;
+}
+
 inline void boxed_existing_bdim_all_batch_rule(
     const c10::OperatorHandle& op, torch::jit::Stack* stack) {
   const auto& schema = op.schema();
   const auto num_returns = schema.returns().size();
   const auto num_arguments = schema.arguments().size();
-  auto arguments = torch::jit::pop(*stack, num_arguments);
 
   c10::impl::ExcludeDispatchKeyGuard guard(kBatchedKey);
   auto maybe_layer = maybeCurrentDynamicLayer();
   TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
   int64_t cur_level = maybe_layer->layerId();
 
-  std::vector<std::pair<Tensor, optional<int64_t>>> tensor_inputs;
-  std::vector<int64_t> tensor_pos;
-  for (const auto idx : c10::irange(0, num_arguments)) {
-    const auto& ivalue = arguments[idx];
-    if (!ivalue.isTensor()) {
-      continue;
-    }
-    Tensor tensor_value;
-    optional<int64_t> tensor_bdim;
-    std::tie(tensor_value, tensor_bdim) = unwrapTensorAtLevel(ivalue.toTensor(), cur_level);
-    tensor_inputs.push_back(std::make_pair(tensor_value, tensor_bdim));
-    tensor_pos.push_back(idx);
-  }
+  int64_t args_begin = stack->size() - num_arguments;
+  SmallVector<UnpackedBatchedTensor, 5> tensor_inputs;
+  SmallVector<int64_t, 5> tensor_pos;
+  int64_t batch_size;
 
-  // compute batch size...
-  int64_t batch_size = -1;
-  for (const auto& tensor_input : tensor_inputs) {
-    const auto& value = tensor_input.first;
-    const auto& bdim = tensor_input.second;
-    if (!bdim) {
-      continue;
-    }
-    if (batch_size == -1) {
-      batch_size = value.size(*bdim);
-    }
-    TORCH_INTERNAL_ASSERT(batch_size == value.size(*bdim));
-  }
+  find_and_unpack_tensors(
+      stack, num_arguments, cur_level,
+      &tensor_inputs, &tensor_pos, &batch_size);
 
   // for each tensor, ensure it has a bdim and reshape it.
-  for (auto& tensor_input : tensor_inputs) {
-    auto value = tensor_input.first;
-    auto bdim = tensor_input.second;
-    value = ensure_has_bdim(value, bdim.has_value(), batch_size);
+  for (const auto tensor_idx : c10::irange(0, tensor_inputs.size())) {
+    const auto& value = std::get<0>(tensor_inputs[tensor_idx]);
+    auto bdim = std::get<1>(tensor_inputs[tensor_idx]);
+    auto value_ = ensure_has_bdim(value, bdim.has_value(), batch_size);
     if (!bdim.has_value()) {
       bdim = 0;
     }
-    tensor_input.first = reshape_dim_into(*bdim, 0, value);
-  }
-
-  size_t tensor_idx = 0;
-  TORCH_INTERNAL_ASSERT(tensor_pos.size() > 0);
-  for (const auto arg_idx : c10::irange(0, num_arguments)) {
-    if (tensor_idx >= tensor_pos.size() || (int64_t)arg_idx != tensor_pos[tensor_idx]) {
-      torch::jit::push(stack, arguments[arg_idx]);
-    } else {
-      TORCH_INTERNAL_ASSERT(tensor_idx < tensor_inputs.size());
-      torch::jit::push(stack, tensor_inputs[tensor_idx].first);
-      tensor_idx++;
-    }
+    (*stack)[args_begin + tensor_pos[tensor_idx]] = reshape_dim_into(*bdim, 0, value_);
   }
 
   op.callBoxed(stack);
-  const auto returns = torch::jit::pop(*stack, num_returns);
-  for (const auto& ret : returns) {
-    if (ret.isTensor()) {
-      torch::jit::push(stack, makeBatched(reshape_dim_outof(0, batch_size, ret.toTensor()), 0, cur_level));
-    } else {
-      TORCH_INTERNAL_ASSERT(false, "This boxed batching rule does not currently support ops that return non-tensor values");
-    }
+
+  for (const auto idx : c10::irange(args_begin, args_begin + num_returns)) {
+    const auto& ret = (*stack)[idx];
+    TORCH_INTERNAL_ASSERT(ret.isTensor(),
+        "This boxed batching rule does not currently support ops that return non-tensor values");
+    (*stack)[idx] = makeBatched(reshape_dim_outof(0, batch_size, ret.toTensor()), 0, cur_level);
   }
 }

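Note on the return handling: after op.callBoxed(stack) the results occupy the stack slots starting at args_begin, so the rewritten rule re-wraps them in place rather than popping and re-pushing them.

For reference, a boxed rule with this signature is registered through torch::CppFunction::makeFromBoxedFunction, in the same style as the VARIADIC_BDIMS_BOXED macro at the top of the hunk. A sketch of what such a registration could look like; the EXISTING_BDIM_ALL_BOXED name is an assumption for illustration and is not shown in this diff:

// Hypothetical registration macro, modeled on VARIADIC_BDIMS_BOXED above;
// the macro name functorch actually uses may differ.
#define EXISTING_BDIM_ALL_BOXED(op) \
  m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_existing_bdim_all_batch_rule>());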
0 commit comments