Commit 17f407c

no_grad fix (#179)
Fixes #13

Case 1: grad gets called inside torch.no_grad.
- grad should ignore torch.no_grad because it's "creating a new level of autograd above the current level"
- Another way to think about this is that grad(f) is a "function transform": its result should not be affected by context managers that are outside of the function f

Case 2: torch.no_grad gets called inside `grad`.
- grad should respect torch.no_grad

See NOTE [grad and vjp interaction with no_grad] for the implementation strategy. It unfortunately involves a mode.

Test Plan:
- Many tests
1 parent 4038cc4 commit 17f407c
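A minimal sketch of the two cases from the commit message, assuming the transform is importable as `from functorch import grad`; the numeric check is worked by hand here, not taken from the test plan:

```python
import torch
from functorch import grad

def f(x):
    # Case 2: a torch.no_grad inside the transformed function is respected,
    # so `c` is treated as a constant by grad.
    with torch.no_grad():
        c = x ** 2
    return x - c

x = torch.tensor(3.0)

# Case 1: grad gets called inside torch.no_grad. The outer no_grad is ignored
# because grad(f) creates a new level of autograd above the current level.
with torch.no_grad():
    result = grad(f)(x)

# d/dx (x - c), with c held constant, is 1.
assert torch.allclose(result, torch.tensor(1.0))
assert torch.allclose(grad(f)(x), torch.tensor(1.0))  # same answer with grad mode on
```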

5 files changed, +181 -72 lines changed

functorch/_src/eager_transforms.py

Lines changed: 91 additions & 41 deletions
@@ -88,27 +88,75 @@ def _autograd_grad(outputs, inputs, grad_outputs=None, retain_graph=False, creat
                         for gi, inp in zip(grad_inputs, inputs))
     return grad_inputs
 
+# NOTE [grad and vjp interaction with no_grad]
+#
+# def f(x):
+#   with torch.no_grad():
+#     c = x ** 2
+#   return x - c
+#
+# The thing to consider is if enable_grad is on/off before grad gets called.
+#
+# Case 1: enable_grad is on.
+#   grad(f)(x)
+# In this case, `grad` should respect the inner torch.no_grad.
+#
+# Case 2: enable_grad is off
+#   with torch.no_grad():
+#     grad(f)(x)
+# In this case, `grad` should respect the inner torch.no_grad, but not the
+# outer one. This is because `grad` is a "function transform": its result
+# should not depend on the result of a context manager outside of `f`.
+#
+# This gives us the following desired behavior:
+# - (nested) grad transforms must obey torch.no_grad inside them
+# - (nested) grad transforms should not obey torch.no_grad outside them
+#
+# To achieve this behavior, upon entering grad/vjp:
+# - we save the current ("previous") is_grad_enabled (*)
+# - we unconditionally enable grad.
+#
+# Inside DynamicLayerBackFallback, when we're temporarily popping `grad` layer
+# off the stack:
+# - if grad_mode is disabled, then we do nothing. (there is a torch.no_grad
+#   active, all subsequent grad transforms must obey it).
+# - if grad_mode is enabled, and the previous is_grad_enabled (*) is False,
+#   then we temporarily restore the previous `is_grad_enabled`. This is
+#   because we're crossing the boundary from a `grad` outside the
+#   no_grad to a `grad` inside the no_grad.
+#
+# NB: vjp has some interesting behavior because the vjp's callable can be called
+# under a different grad_mode than the forward computation...
+#
+# TODO: forward-mode AD: does it also respect no_grad? What does that mean
+# for our jvp transform?
+
+
 # How do we increment and decrement the nesting? I don't think we can.
 def vjp(f, *primals):
     level = _grad_increment_nesting()
     try:
-        primals = _wrap_all_tensors(primals, level)
-        diff_primals = _create_differentiable(primals, level)
-        primals_out = f(*diff_primals)
-
-        results = _undo_create_differentiable(primals_out, level)
-        flat_diff_primals, primals_spec = tree_flatten(diff_primals)
-        flat_primals_out, primals_out_spec = tree_flatten(primals_out)
-
-        for primal_out in flat_primals_out:
-            assert isinstance(primal_out, torch.Tensor)
-            if primal_out.is_floating_point() or primal_out.is_complex():
-                continue
-            raise RuntimeError("vjp(f, ...): All outputs of f must be "
-                               "floating-point or complex Tensors, got Tensor "
-                               f"with dtype {primal_out.dtype}")
-
-        def wrapper(cotangents, retain_graph=True, create_graph=True):
+        # See NOTE [grad and vjp interaction with no_grad]
+        with torch.enable_grad():
+            primals = _wrap_all_tensors(primals, level)
+            diff_primals = _create_differentiable(primals, level)
+            primals_out = f(*diff_primals)
+
+            results = _undo_create_differentiable(primals_out, level)
+            flat_diff_primals, primals_spec = tree_flatten(diff_primals)
+            flat_primals_out, primals_out_spec = tree_flatten(primals_out)
+
+            for primal_out in flat_primals_out:
+                assert isinstance(primal_out, torch.Tensor)
+                if primal_out.is_floating_point() or primal_out.is_complex():
+                    continue
+                raise RuntimeError("vjp(f, ...): All outputs of f must be "
+                                   "floating-point or complex Tensors, got Tensor "
+                                   f"with dtype {primal_out.dtype}")
+
+        def wrapper(cotangents, retain_graph=True, create_graph=None):
+            if create_graph is None:
+                create_graph = torch.is_grad_enabled()
             flat_cotangents, cotangents_spec = tree_flatten(cotangents)
             if primals_out_spec != cotangents_spec:
                 raise RuntimeError(
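With the new `create_graph=None` default, the backward callable picks up the grad mode at call time instead of always building a higher-order graph. A hedged sketch of the intended usage (the `torch.sin` choice and cotangent values are illustrative); the second hunk below applies the same `torch.enable_grad()` treatment to the `grad_and_value` wrapper:

```python
import torch
from functorch import vjp

x = torch.randn(3)

# Even under an outer torch.no_grad, the forward pass inside vjp runs with
# grad enabled (see the `with torch.enable_grad():` block above).
with torch.no_grad():
    out, vjp_fn = vjp(torch.sin, x)
    # Called here, create_graph=None resolves to torch.is_grad_enabled(),
    # i.e. False, so no graph is recorded for the backward computation.
    (grad_x,) = vjp_fn(torch.ones_like(x))

# vjp of sin with a cotangent of ones is cos(x).
assert torch.allclose(grad_x, x.cos())
```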
@@ -236,30 +284,32 @@ def wrapper(*args, **kwargs):
        level = _grad_increment_nesting()
        output, aux, grad_input = None, None, None
        try:
-            args = _wrap_all_tensors(args, level)
-            kwargs = _wrap_all_tensors(kwargs, level)
-            diff_args = _slice_argnums(args, argnums)
-            tree_map_(partial(_create_differentiable, level=level), diff_args)
-
-            output = f(*args, **kwargs)
-            if has_aux:
-                output, aux = output
-
-            if not isinstance(output, torch.Tensor):
-                raise RuntimeError('grad_and_value(f)(*args): Expected f(*args)'
-                                   f'to return a Tensor, got {type(output)}')
-            if output.dim() != 0:
-                raise RuntimeError('grad_and_value(f)(*args): Expected f(*args)'
-                                   'to return a scalar Tensor, got tensor with '
-                                   f'{output.dim()} dims. Maybe you wanted to'
-                                   'use the vjp or jacrev APIs instead?')
-
-            flat_diff_args, spec = tree_flatten(diff_args)
-
-            # NB: need create_graph so that backward pass isn't run in no_grad mode
-            flat_outputs = _as_tuple(output)
-            flat_grad_input = _autograd_grad(flat_outputs, flat_diff_args, create_graph=True)
-            grad_input = tree_unflatten(flat_grad_input, spec)
+            # See NOTE [grad and vjp interaction with no_grad]
+            with torch.enable_grad():
+                args = _wrap_all_tensors(args, level)
+                kwargs = _wrap_all_tensors(kwargs, level)
+                diff_args = _slice_argnums(args, argnums)
+                tree_map_(partial(_create_differentiable, level=level), diff_args)
+
+                output = f(*args, **kwargs)
+                if has_aux:
+                    output, aux = output
+
+                if not isinstance(output, torch.Tensor):
+                    raise RuntimeError('grad_and_value(f)(*args): Expected f(*args)'
+                                       f'to return a Tensor, got {type(output)}')
+                if output.dim() != 0:
+                    raise RuntimeError('grad_and_value(f)(*args): Expected f(*args)'
+                                       'to return a scalar Tensor, got tensor with '
+                                       f'{output.dim()} dims. Maybe you wanted to'
+                                       'use the vjp or jacrev APIs instead?')
+
+                flat_diff_args, spec = tree_flatten(diff_args)
+
+                # NB: need create_graph so that backward pass isn't run in no_grad mode
+                flat_outputs = _as_tuple(output)
+                flat_grad_input = _autograd_grad(flat_outputs, flat_diff_args, create_graph=True)
+                grad_input = tree_unflatten(flat_grad_input, spec)
 
        finally:
            if grad_input is not None:
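Because `grad` is built on this wrapper, the same behavior holds for `grad_and_value`. A sketch, assuming `grad_and_value` is exported at the functorch top level and returns a `(gradients, value)` pair as its name suggests:

```python
import torch
from functorch import grad_and_value

def loss(w):
    return (w ** 2).sum()

w = torch.ones(3)

# Works the same way under an outer no_grad: the transform's forward pass and
# backward pass both run with grad enabled internally.
with torch.no_grad():
    grads, value = grad_and_value(loss)(w)

assert torch.allclose(grads, 2 * w)           # d/dw sum(w**2) = 2w
assert torch.allclose(value, torch.tensor(3.0))
```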

functorch/csrc/DynamicLayer.cpp

Lines changed: 26 additions & 7 deletions
@@ -33,7 +33,7 @@ class DynamicLayerStackHolder : public c10::DebugInfoBase {
   DynamicLayerStackHolder() {}
   virtual ~DynamicLayerStackHolder() {}
 
-  std::vector<DynamicLayer> dynamicLayerStack = { DynamicLayer(DispatchKey::Autograd, 1) };
+  std::vector<DynamicLayer> dynamicLayerStack = { DynamicLayer(DispatchKey::Autograd, 1, nullopt, true) };
 };
 
 thread_local std::shared_ptr<DynamicLayerStackHolder> kDynamicLayerStack;
@@ -117,13 +117,16 @@ static int64_t pushDynamicLayer(DynamicLayer&& dynamic_layer) {
   return layerId;
 }
 
-static int64_t pushDynamicLayer(DispatchKey key, optional<int64_t> batch_size = nullopt) {
+static int64_t pushDynamicLayer(
+    DispatchKey key,
+    optional<int64_t> batch_size = nullopt,
+    optional<bool> prev_grad_mode = nullopt) {
   auto& dynamicLayerStack = dynamicLayerStackAccessor();
   TORCH_INTERNAL_ASSERT(key != DispatchKey::Undefined);
   TORCH_INTERNAL_ASSERT(key != DispatchKey::Batched);
 
   auto layerId = 1 + dynamicLayerStack.size();
-  dynamicLayerStack.emplace_back(key, layerId, batch_size);
+  dynamicLayerStack.emplace_back(key, layerId, batch_size, prev_grad_mode);
 
   if (layerId == 2) {
     // std::cout << "DynamicLayer on" << std::endl;
@@ -134,10 +137,16 @@ static int64_t pushDynamicLayer(DispatchKey key, optional<int64_t> batch_size =
   return layerId;
 }
 
-int64_t initAndPushDynamicLayer(DispatchKey key, optional<int64_t> batch_size) {
-  auto layerId = pushDynamicLayer(key, batch_size);
+int64_t initAndPushDynamicLayer(
+    DispatchKey key,
+    optional<int64_t> batch_size,
+    optional<bool> prev_grad_mode) {
+  auto layerId = pushDynamicLayer(key, batch_size, prev_grad_mode);
   auto& data = getGlobalDynmetaData();
   TORCH_INTERNAL_ASSERT(data.find(layerId) == data.end());
+  if (key == DispatchKey::Autograd) {
+    TORCH_INTERNAL_ASSERT(prev_grad_mode.has_value());
+  }
   data[layerId] = std::make_shared<bool>(true);
   return layerId;
 }
@@ -374,7 +383,6 @@ struct WithoutTop {
     pushDynamicLayer(std::move(layer_));
   }
 
-  bool prev_grad_enabled_;
   DynamicLayer layer_;
 };
 
@@ -394,6 +402,11 @@ void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack*
   auto cur_level = getDynamicLayerStack().back().layerId();
   auto cur_key = getDynamicLayerStack().back().key();
 
+  optional<bool> prev_grad_mode = getDynamicLayerStack().back().prevGradMode();
+  if (cur_key == DispatchKey::Autograd) {
+    TORCH_INTERNAL_ASSERT(prev_grad_mode.has_value());
+  }
+
   auto unwrap = [&](const Tensor& tensor) {
     if (!tensor.defined()) {
       return tensor;
@@ -457,7 +470,13 @@ void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack*
   c10::impl::tls_set_dispatch_key_included(kDynamicLayerBackModeKey, true);
 
   // Re-dispatch
-  op.callBoxed(stack);
+  if (cur_key == DispatchKey::Autograd && *prev_grad_mode == false) {
+    // See NOTE [grad and vjp interaction with no_grad]
+    c10::AutoGradMode guard(*prev_grad_mode);
+    op.callBoxed(stack);
+  } else {
+    op.callBoxed(stack);
+  }
 
   // Step 4, 5, 6
   if (cur_key == DispatchKey::Autograd) {
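The `prev_grad_mode` bookkeeping and the `AutoGradMode` guard above are what keep nested transforms consistent when a `torch.no_grad` sits between grad levels. A hedged sketch of the user-visible behavior this enables (the values are worked arithmetic, not output copied from the test suite):

```python
import torch
from functorch import grad

def f(x):
    # Respected by every grad level above it: `shift` is a constant.
    with torch.no_grad():
        shift = x ** 2
    return x * x - shift

# Second derivative under an outer no_grad: the outer no_grad is ignored by
# both transforms, the inner one is respected by both, so f behaves like
# x**2 minus a constant.
with torch.no_grad():
    second = grad(grad(f))(torch.tensor(2.0))

# f'(x) = 2x, so f''(x) = 2.
assert torch.allclose(second, torch.tensor(2.0))
```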

functorch/csrc/DynamicLayer.h

Lines changed: 24 additions & 2 deletions
@@ -17,21 +17,43 @@ namespace at {
 namespace functorch {
 
 struct TORCH_API DynamicLayer {
-  DynamicLayer(DispatchKey key, int64_t layerId, optional<int64_t> batchSize = nullopt): key_(key), layerId_(layerId), batchSize_(batchSize) {}
+  DynamicLayer(
+      DispatchKey key,
+      int64_t layerId,
+      optional<int64_t> batchSize = nullopt,
+      optional<bool> prev_grad_mode = nullopt):
+    key_(key), layerId_(layerId), batchSize_(batchSize), prevGradMode_(prev_grad_mode)
+  {
+    if (key_ == DispatchKey::Autograd) {
+      TORCH_INTERNAL_ASSERT(prev_grad_mode.has_value());
+    }
+  }
 
   DispatchKey key() const { return key_; }
   int64_t layerId() const { return layerId_; }
+  // Only valid for vmap
   int64_t batchSize() const {
     TORCH_INTERNAL_ASSERT(batchSize_);
     return *batchSize_;
   }
+  // only valid for grad-based transforms
+  optional<bool> prevGradMode() const {
+    return prevGradMode_;
+  }
 private:
   DispatchKey key_;
   int64_t layerId_;
+
+  // Honestly these should be a union or some extendable metadata class.
+  // Not doing that for now because I don't think we'll use this mechanism for very long.
   optional<int64_t> batchSize_;
+  optional<bool> prevGradMode_;
 };
 
-TORCH_API int64_t initAndPushDynamicLayer(DispatchKey key, optional<int64_t> batch_size = nullopt);
+TORCH_API int64_t initAndPushDynamicLayer(
+    DispatchKey key,
+    optional<int64_t> batch_size = nullopt,
+    optional<bool> prev_grad_mode = nullopt);
 TORCH_API DynamicLayer popDynamicLayerAndDeleteMetadata();
 TORCH_API c10::optional<DynamicLayer> maybeCurrentDynamicLayer();
 TORCH_API const std::vector<DynamicLayer>& getDynamicLayerStack();

functorch/csrc/init.cpp

Lines changed: 3 additions & 1 deletion
@@ -158,7 +158,9 @@ bool dump_tensor(const Tensor& self) {
 }
 
 int64_t _grad_increment_nesting() {
-  return initAndPushDynamicLayer(at::DispatchKey::Autograd);
+  // See NOTE [grad and vjp interaction with no_grad]
+  bool prev_grad_mode = c10::GradMode::is_enabled();
+  return initAndPushDynamicLayer(at::DispatchKey::Autograd, nullopt, prev_grad_mode);
 }
 
 int64_t _grad_decrement_nesting() {
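`_grad_increment_nesting` now snapshots the grad mode that was active when the transform was entered; inside the transformed function, grad is unconditionally enabled (via the `torch.enable_grad()` block in `eager_transforms.py`), and the snapshot is only consulted when dispatch crosses back below the grad layer. A small sketch of the observable consequence, under those assumptions:

```python
import torch
from functorch import grad

def f(x):
    # True even though grad(f) is invoked under torch.no_grad below:
    # grad enables grad mode on entry and only restores the saved
    # ("previous") mode when re-dispatching below its own layer.
    assert torch.is_grad_enabled()
    return (x ** 2).sum()

with torch.no_grad():
    g = grad(f)(torch.randn(3))
```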
