From 0127eba3a8e66af65bee9c15cd48d2e4a6b34f99 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 30 Sep 2025 12:56:46 +0000
Subject: [PATCH 1/2] Initial plan

From 02139c476a6698479d9ce677f2fb8474bb1f1942 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 30 Sep 2025 13:08:43 +0000
Subject: [PATCH 2/2] Fix gradient mismatch in checkpointing by preserving node
 metadata

The issue was that when converting torch operators to thunder operators in
checkpointed functions, the node metadata (containing requires_grad, dtype,
shape, etc.) was not being copied. This caused gradient computation issues
during the backward pass, especially on B200/GB200 GPUs, where numerical
precision is critical.

The fix adds a single line to copy metadata from the original nodes to the
new thunder nodes, preserving all tensor properties necessary for correct
gradient computation.

Co-authored-by: IvanYashchuk <19621411+IvanYashchuk@users.noreply.github.com>
---
 thunder/dynamo/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/thunder/dynamo/utils.py b/thunder/dynamo/utils.py
index 3c58bc2ded..a30e9ecc2e 100644
--- a/thunder/dynamo/utils.py
+++ b/thunder/dynamo/utils.py
@@ -650,6 +650,9 @@ def _checkpoint_function_converter(gm: torch.fx.GraphModule):
                 thunder_node = gm.graph.call_function(
                     _torch_to_thunder_function_map[n.target], args=n.args, kwargs=n.kwargs
                 )
+            # Copy metadata from the original node to preserve tensor properties like
+            # requires_grad, dtype, shape, etc. which are crucial for gradient computation
+            thunder_node.meta = n.meta.copy()
             n.replace_all_uses_with(thunder_node)
             gm.graph.erase_node(n)
         else:
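
Note (not part of the patch): the sketch below shows the same technique in
isolation, replacing a torch.fx call_function node and copying node.meta so
that metadata recorded by shape propagation survives the rewrite. The traced
function f, the replacement target, and the tensor shape are illustrative
assumptions, not taken from Thunder.

    import torch
    import torch.fx
    from torch.fx.passes.shape_prop import ShapeProp


    def f(x):
        return torch.relu(x)


    gm = torch.fx.symbolic_trace(f)

    # Shape propagation records shape/dtype/requires_grad in each node's .meta.
    ShapeProp(gm).propagate(torch.randn(2, 3, requires_grad=True))

    # Replace torch.relu with an equivalent target (illustrative), copying
    # .meta so the recorded tensor properties survive the rewrite.
    for n in list(gm.graph.nodes):
        if n.op == "call_function" and n.target is torch.relu:
            with gm.graph.inserting_before(n):
                new_node = gm.graph.call_function(
                    torch.nn.functional.relu, args=n.args, kwargs=n.kwargs
                )
            new_node.meta = n.meta.copy()  # without this line the metadata is lost
            n.replace_all_uses_with(new_node)
            gm.graph.erase_node(n)

    gm.recompile()
    print(gm.code)

Dropping the new_node.meta = n.meta.copy() line leaves the replacement node
with empty metadata, which is the kind of missing-metadata situation the
patch above addresses for checkpointed functions.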