Skip to content

Commit 36f8a4a

Browse files
committed
Updated comments
1 parent 5e58f4a commit 36f8a4a

File tree

3 files changed

+30
-25
lines changed

3 files changed

+30
-25
lines changed

thunder/core/jit_ext.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -941,8 +941,6 @@ def _generate_random_str_id() -> str:
941941
return "".join(secrets.choice(string.ascii_lowercase) for _ in range(length))
942942

943943
# Support both stable PyTorch (with args_tensor_mask) and nightly (without it)
944-
# See changelog.md for details on the args_tensor_mask removal in nightly PyTorch
945-
# Note: Use "in" check rather than .get() to handle wrapped values correctly
946944
if "args_tensor_mask" in fwd_kwargs:
947945
args_tensor_mask = unwrap(fwd_kwargs["args_tensor_mask"])
948946
else:
@@ -972,8 +970,8 @@ def _generate_random_str_id() -> str:
972970
# With args_tensor_mask, the fwd_body expects ctx as first argument
973971
new_fwd_args = (wrap_const(None),) + tuple(new_fwd_args)
974972
else:
975-
# For nightly PyTorch without args_tensor_mask, the ctx handling is internalized
976-
# by dynamo. The fwd_body GraphModule does NOT expect a ctx argument.
973+
# For nightly PyTorch without args_tensor_mask, the fwd_body
974+
# GraphModule does NOT expect a ctx argument.
977975
# We pass all args as-is without prepending None.
978976
new_fwd_args = tuple(fwd_args)
979977
unwrapped_fwd_args = tree_map(lambda t: unwrap(t), new_fwd_args)
@@ -1014,8 +1012,6 @@ def forward(*args, **kwargs):
10141012
grads = sequencify(tree_map(lambda t: TensorProxy(like=t), sequencify(output)))
10151013
bwd_tensor_args = grads + tuple(saved_values)
10161014
# Support both stable PyTorch (with args_tensor_mask) and nightly (without it)
1017-
# With args_tensor_mask, bwd_body expects ctx as first argument
1018-
# Without args_tensor_mask, ctx handling is internalized - no ctx argument needed
10191015
if args_tensor_mask is not None:
10201016
bwd_args = (None,) + bwd_tensor_args
10211017
else:
@@ -1050,11 +1046,11 @@ def grad_transform(*args, **kwargs):
10501046
# Support both stable PyTorch (with args_tensor_mask) and nightly (without it)
10511047
if args_tensor_mask is not None:
10521048
bwd_args = (None,) + tuple(grads) + tuple(sequencify(residuals))
1053-
# Old API: first arg is ctx, skip it for put_grads
1049+
# Stable PT: first arg is ctx, skip it for put_grads
10541050
grad_inputs = args[1:]
10551051
else:
10561052
bwd_args = tuple(grads) + tuple(sequencify(residuals))
1057-
# New API: no ctx, use all args
1053+
# Nightly PT: no ctx, use all args
10581054
grad_inputs = args
10591055
result = interpret_trace(aliased_bwd_trace, *bwd_args)
10601056
put_grads(grad_inputs, result)

thunder/tests/test_jit_general.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,15 @@
1414

1515
import thunder
1616

17+
from thunder.tests.framework import requiresCUDA, IS_WINDOWS
18+
from thunder.core.options import CACHE_OPTIONS
19+
import thunder.core.prims as prims
20+
from thunder import pytorch_executor, nvfuser_executor
21+
from thunder.executors.sdpaex import sdpa_ex
22+
from thunder.core.transforms import Transform
1723

1824
# Detect once at module load time whether PyTorch uses args_tensor_mask.
19-
# This must be done outside the JIT-traced function to avoid interpreter issues
20-
# with inspect.getsource() and tokenize internals.
25+
# This must be done outside the JIT-traced function to avoid interpreter issues.
2126
def _detect_has_args_tensor_mask():
2227
"""Check if autograd_function_apply uses args_tensor_mask.
2328
@@ -46,14 +51,6 @@ def _autograd_function_apply_kwargs(args_tensor_mask, non_differentiable_idx=Non
4651
return kwargs
4752

4853

49-
from thunder.tests.framework import requiresCUDA, IS_WINDOWS
50-
from thunder.core.options import CACHE_OPTIONS
51-
import thunder.core.prims as prims
52-
from thunder import pytorch_executor, nvfuser_executor
53-
from thunder.executors.sdpaex import sdpa_ex
54-
from thunder.core.transforms import Transform
55-
56-
5754
thunder_jit = partial(thunder.jit, debug_options=thunder.DebugOptions(check_traces=2))
5855

5956
#
@@ -1292,8 +1289,9 @@ def test_autograd_function_apply():
12921289
# since https://github.com/pytorch/pytorch/pull/169528 `torch.ops.higher_order.autograd_function_apply`
12931290
# no longer accepts simple callables, but rather `torch.fx.GraphModule`s.
12941291

1292+
# TODO: Remove this once this autograd API becomes stable.
12951293
# On stable PyTorch (with args_tensor_mask), forward/backward expect ctx as first arg.
1296-
# On nightly PyTorch (without args_tensor_mask), ctx handling is internalized.
1294+
# On nightly PyTorch (without args_tensor_mask), ctx is not an argument.
12971295
if _HAS_ARGS_TENSOR_MASK:
12981296

12991297
class FwdModule(torch.nn.Module):
@@ -1341,6 +1339,9 @@ def my_sin(x):
13411339
expect_grad = torch.autograd.grad(y_ref, x_ref, grad)
13421340
torch.testing.assert_close(actual_grad, expect_grad)
13431341

1342+
# TODO: Remove this once this autograd API becomes stable.
1343+
# On stable PyTorch (with args_tensor_mask), forward/backward expect ctx as first arg.
1344+
# On nightly PyTorch (without args_tensor_mask), ctx is not an argument.
13441345
if _HAS_ARGS_TENSOR_MASK:
13451346

13461347
class WrongBwdModule(torch.nn.Module):
@@ -1383,8 +1384,9 @@ def my_sin_with_wrong_backward(x):
13831384

13841385
def test_autograd_function_apply_with_no_grad():
13851386
# This case is using `torch` operations
1387+
# TODO: Remove this once this autograd API becomes stable.
13861388
# On stable PyTorch (with args_tensor_mask), forward/backward expect ctx as first arg.
1387-
# On nightly PyTorch (without args_tensor_mask), ctx handling is internalized.
1389+
# On nightly PyTorch (without args_tensor_mask), ctx is not an argument.
13881390
if _HAS_ARGS_TENSOR_MASK:
13891391

13901392
def forward(_, x):
@@ -1429,6 +1431,9 @@ def my_sin(x):
14291431

14301432
# This is using `thunder` operations
14311433
# NOTE - This takes a different codepath compared to above.
1434+
# TODO: Remove this once this autograd API becomes stable.
1435+
# On stable PyTorch (with args_tensor_mask), forward/backward expect ctx as first arg.
1436+
# On nightly PyTorch (without args_tensor_mask), ctx is not an argument.
14321437
if _HAS_ARGS_TENSOR_MASK:
14331438

14341439
def forward(_, x):

thunder/torch/__init__.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6716,9 +6716,9 @@ def autograd_function_apply(
67166716
args_tensor_mask: Sequence[bool] | None = None,
67176717
non_differentiable_idx: Sequence[int] | None = None,
67186718
) -> TensorProxy | tuple[TensorProxy, ...]:
6719-
# Support both stable PyTorch (with args_tensor_mask) and nightly (without it)
6720-
# With args_tensor_mask, fwd expects ctx as first argument
6721-
# Without args_tensor_mask, ctx handling is internalized - no ctx argument needed
6719+
# TODO: Remove this once this autograd API becomes stable.
6720+
# On stable PyTorch, fwd expects ctx as first argument
6721+
# On nightly PyTorch, ctx is not an argument
67226722
if args_tensor_mask is not None:
67236723
result, saved_for_backward = call_higher_order_function_and_consider_outer_autograd_setting(fwd)(None, *args)
67246724
else:
@@ -6734,7 +6734,9 @@ def augmented_forward_autograd_function_apply(
67346734
args_tensor_mask: Sequence[bool] | None = None,
67356735
non_differentiable_idx: Sequence[int] | None = None,
67366736
) -> tuple[TensorProxy | tuple[TensorProxy, ...], tuple[Any, ...]]:
6737-
# Support both stable PyTorch (with args_tensor_mask) and nightly (without it)
6737+
# TODO: Remove this once this autograd API becomes stable.
6738+
# On stable PyTorch, fwd expects ctx as first argument
6739+
# On nightly PyTorch, ctx is not an argument
67386740
if args_tensor_mask is not None:
67396741
result, saved_for_backward = fwd(None, *args)
67406742
else:
@@ -6750,7 +6752,9 @@ def backward_autograd_function_apply(
67506752
non_differentiable_idx: Sequence[int] | None = None,
67516753
*grad_output: Sequence[TensorProxy],
67526754
) -> tuple[Any, ...]:
6753-
# Support both stable PyTorch (with args_tensor_mask) and nightly (without it)
6755+
# TODO: Remove this once this autograd API becomes stable.
6756+
# On stable PyTorch, bwd expects ctx as first argument
6757+
# On nightly PyTorch, ctx is not an argument
67546758
if args_tensor_mask is not None:
67556759
return bwd(None, *grad_output, *saved_for_backward)
67566760
else:

0 commit comments

Comments
 (0)