
Commit 2674d1d

[not for land] debug accuracy logging for float8 training
Summary:

A lightweight logging flag to log the SQNR between the float8 gemm output and
the bf16 gemm output.

Test Plan:

```bash
```

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: f419ea2
ghstack-comment-id: 3160382784
Pull Request resolved: #2701
1 parent 5d99ce4 commit 2674d1d
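
For context, a minimal sketch of how the new flag is meant to be exercised, mirroring the `test_debug_logging` test in this diff. The `Float8LinearRecipeName` import path and the exact log text are assumptions; everything else follows the test code, and the flag plus `_populate_debug_fqns` are prototype/private surface that may change.

```python
# Minimal sketch based on the new test in this commit; not a stable API.
import torch
import torch.nn as nn

from torchao.float8.config import Float8LinearConfig, Float8LinearRecipeName  # assumed import path
from torchao.float8.float8_linear_utils import (
    _populate_debug_fqns,
    convert_to_float8_training,
)

m = nn.Sequential(
    nn.Linear(16, 32, bias=False, device="cuda", dtype=torch.bfloat16),
    nn.Linear(32, 64, bias=False, device="cuda", dtype=torch.bfloat16),
)
config = Float8LinearConfig.from_recipe_name(Float8LinearRecipeName.TENSORWISE)
# the config dataclass is frozen, so the prototype flag is set the same way the test does
object.__setattr__(config, "_enable_debug_logging", True)

m = convert_to_float8_training(m, config=config)
_populate_debug_fqns(m)  # gives each Float8Linear an FQN to include in its log lines
m = torch.compile(m)

x = torch.randn(1, 16, 16, device="cuda", dtype=torch.bfloat16)
y = m(x)
y.sum().backward()
# each Float8Linear then prints lines like:
#   fqn: 0, gemm_name: output, sqnr: ...
```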

File tree

4 files changed: +81 -3 lines changed

test/float8/test_base.py

Lines changed: 30 additions & 1 deletion
@@ -33,7 +33,10 @@
     e5m2_dtype,
 )
 from torchao.float8.float8_linear import Float8Linear
-from torchao.float8.float8_linear_utils import convert_to_float8_training
+from torchao.float8.float8_linear_utils import (
+    _populate_debug_fqns,
+    convert_to_float8_training,
+)
 from torchao.float8.float8_ops import addmm_float8_unwrapped
 from torchao.float8.float8_scaling_utils import (
     get_maybe_axiswise_dim,
@@ -400,6 +403,32 @@ def test_linear_from_recipe(
             config,
         )
 
+    @pytest.mark.parametrize(
+        "recipe_name",
+        [
+            Float8LinearRecipeName.TENSORWISE,
+            Float8LinearRecipeName.ROWWISE,
+            Float8LinearRecipeName.ROWWISE_WITH_GW_HP,
+        ],
+    )
+    def test_debug_logging(self, recipe_name):
+        x = torch.randn(1, 16, 16, device="cuda", dtype=torch.bfloat16)
+        m = nn.Sequential(
+            nn.Linear(16, 32, bias=False, device="cuda", dtype=torch.bfloat16),
+            nn.Sequential(
+                nn.ReLU(),
+                nn.Linear(32, 64, bias=False, device="cuda", dtype=torch.bfloat16),
+            ),
+        )
+        config = Float8LinearConfig.from_recipe_name(recipe_name)
+        object.__setattr__(config, "_enable_debug_logging", True)
+        m = convert_to_float8_training(m, config=config)
+        _populate_debug_fqns(m)
+        m = torch.compile(m)
+        y = m(x)
+        y.sum().backward()
+        # TODO(before land): actually test the values logged to stdout
+
     @pytest.mark.parametrize(
         "emulate", [True, False] if is_sm_at_least_89() else [True]
     )
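
Note that `log_sqnr` uses `print`, so pytest's default output capturing hides the log lines; running the new test with capture disabled (for example `pytest test/float8/test_base.py -k test_debug_logging -s`) shows them, and the TODO above indicates the logged values are not yet asserted on.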

torchao/float8/config.py

Lines changed: 6 additions & 0 deletions
@@ -204,6 +204,12 @@ class Float8LinearConfig:
     # same value in the forward pass as the backward passes.
     round_scales_to_power_of_2: bool = False
 
+    # If True, captures accuracy debugging logging comparing high precision gemm
+    # outputs to their low precision versions, and outputs it to stdout
+    # Note: this flag is in prototype, has not been extensively tested and the
+    # API may change.
+    _enable_debug_logging: bool = False
+
     def __post_init__(self):
         # Populate the additional cast overrides, if the user did not specify them
         # Note: this hacks around the frozen-ness of this dataclass

torchao/float8/float8_linear.py

Lines changed: 39 additions & 2 deletions
@@ -22,10 +22,19 @@
     LinearMMConfig,
     ScaledMMConfig,
 )
+from torchao.float8.float8_utils import compute_error
 from torchao.float8.fsdp_utils import WeightWithDynamicFloat8CastTensor
 
 
-@torch._dynamo.allow_in_graph
+@torch._dynamo.disable
+def log_sqnr(fqn, gemm_name, sqnr):
+    # TODO(future): use logging instead of print, will be more annoying to test
+    # with pytest
+    print(f"fqn: {fqn}, gemm_name: {gemm_name}, sqnr: {sqnr}")
+
+
+# note: need to remove torch._dynamo.allow_in_graph for logging to work with torch.compile
+# @torch._dynamo.allow_in_graph
 class matmul_with_hp_or_float8_args(torch.autograd.Function):
     """
     Like torch.matmul, but with the arguments in either high precision or float8.
@@ -41,10 +50,12 @@ def forward(
         weight_hp_t: torch.Tensor,
         linear_mm_config: LinearMMConfig,
         config: Float8LinearConfig,
+        debug_fqn: Optional[str],
     ):
         ctx.save_for_backward(input_hp, weight_hp_t)
         ctx.linear_mm_config = linear_mm_config
         ctx.config = config
+        ctx.debug_fqn = debug_fqn
 
         c = config
 
@@ -87,13 +98,21 @@ def forward(
         orig_shape = input_maybe_fp8.shape
         input_maybe_fp8_reshaped = input_maybe_fp8.reshape(-1, orig_shape[-1])
         res_bits = torch.mm(input_maybe_fp8_reshaped, weight_maybe_fp8_t)
+
+        if config._enable_debug_logging:
+            input_hp_reshaped = input_hp.reshape(-1, orig_shape[-1])
+            ref_result = torch.mm(input_hp_reshaped, weight_hp_t)
+            output_sqnr = compute_error(ref_result, res_bits)
+            log_sqnr(debug_fqn, "output", output_sqnr)
+
         res_bits = res_bits.reshape(*orig_shape[:-1], res_bits.shape[-1])
         return res_bits
 
     @staticmethod
     def backward(ctx, grad_output):
         input_hp, weight_hp_t = ctx.saved_tensors
         c = ctx.config
+        debug_fqn = ctx.debug_fqn
 
         # the reshapes are needed in order to make the shapes compatible with
         # torch.mm
@@ -144,6 +163,10 @@ def backward(ctx, grad_output):
             grad_output_reshaped_maybe_fp8_dim0,
             weight_t_maybe_fp8_dim0.t(),
         )
+        if c._enable_debug_logging:
+            ref_grad_input = torch.mm(grad_output_reshaped, weight_hp_t.t())
+            grad_input_sqnr = compute_error(ref_grad_input, grad_input)
+            log_sqnr(debug_fqn, "grad_input", grad_input_sqnr)
         grad_input = grad_input.reshape(
             *grad_output_orig_shape[:-1], grad_input.shape[-1]
         )
@@ -198,8 +221,17 @@ def backward(ctx, grad_output):
             grad_output_reshaped_maybe_fp8_dim1.t(),
             input_reshaped_maybe_fp8_dim1,
         )
+        if c._enable_debug_logging:
+            # don't log if this gemm is in high precision
+            this_gemm_is_hp = (
+                c.cast_config_input_for_grad_weight.scaling_type is ScalingType.DISABLED
+            )
+            if not this_gemm_is_hp:
+                ref_grad_weight = torch.mm(grad_output_reshaped.t(), input_hp_reshaped)
+                grad_weight_sqnr = compute_error(ref_grad_weight, grad_weight)
+                log_sqnr(debug_fqn, "grad_weight", grad_weight_sqnr)
 
-        empty_grads = None, None
+        empty_grads = None, None, None
 
         return grad_input, grad_weight.t(), *empty_grads
 
@@ -252,6 +284,10 @@ def __init__(self, *args, **kwargs):
             ),
         )
 
+        # debugging only, API may change at any time. This is expected to be
+        # set by the user in a separate API call.
+        self._debug_fqn: Optional[str] = None
+
     def forward(self, input: torch.Tensor) -> torch.Tensor:
         # Duplicate the autocast logic for F.linear, so that the output
         # of our module has the right original precision
@@ -266,6 +302,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
             self.weight.t(),
             self.linear_mm_config,
             self.config,
+            self._debug_fqn,
         )
 
         if self.bias is not None:
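
The logged number is whatever `compute_error` from `torchao.float8.float8_utils` returns for (high precision reference, low precision result). As a rough mental model, SQNR in dB is commonly computed as below; `sqnr_db` here is an illustrative stand-in, not the actual `compute_error` implementation:

```python
import torch

def sqnr_db(ref: torch.Tensor, candidate: torch.Tensor) -> torch.Tensor:
    # Signal-to-quantization-noise ratio in decibels:
    #   20 * log10(||ref|| / ||ref - candidate||)
    # Higher is better; identical tensors would give +inf, a lossy cast a finite positive value.
    signal = torch.linalg.norm(ref.float())
    noise = torch.linalg.norm(ref.float() - candidate.float())
    return 20 * torch.log10(signal / noise)
```

With `_enable_debug_logging` set, each `Float8Linear` emits three such values per training step: `output` in the forward, plus `grad_input` and `grad_weight` in the backward (the `grad_weight` log is skipped when that gemm runs in high precision).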

torchao/float8/float8_linear_utils.py

Lines changed: 6 additions & 0 deletions
@@ -196,3 +196,9 @@ def _auto_filter_for_tensorwise(
     if K <= 4096 and N <= 1024:
         return False
     return True
+
+
+def _populate_debug_fqns(model: nn.Module):
+    for name, mod in model.named_modules():
+        if isinstance(mod, Float8Linear):
+            mod._debug_fqn = name
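
`_populate_debug_fqns` simply reuses the names produced by `nn.Module.named_modules()`. A small illustration of which names the two `Float8Linear` modules in the new test would end up logging under (plain `nn.Linear` stands in for `Float8Linear` here):

```python
import torch.nn as nn

m = nn.Sequential(
    nn.Linear(16, 32, bias=False),
    nn.Sequential(
        nn.ReLU(),
        nn.Linear(32, 64, bias=False),
    ),
)
# named_modules() walks the tree and yields dotted names: "", "0", "1", "1.0", "1.1"
for name, mod in m.named_modules():
    if isinstance(mod, nn.Linear):
        print(name)  # prints "0" then "1.1" -> these become the _debug_fqn values
```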
