
Commit 6aaabc1

[release/2.8] layernorm tests: Tweak test thresholds for comparing tensors (#2583)
After PR pytorch#156600, this test was failing internally on large tensors because the differences were greater than the tolerances on some CUDA devices. We now raise the tolerances for larger tensors.

Pull Request resolved: pytorch#156699
Approved by: https://github.com/eqy, https://github.com/ngimel
(cherry picked from commit 36dd598)
Fixes SWDEV-547998

Co-authored-by: Ahmad Sharif <[email protected]>
Parent: 7b2a4fd · Commit: 6aaabc1
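
For context on why tensor size matters here, the sketch below is an illustration added for this writeup, not part of the commit: it compares a float32 sum against a float64 reference at several reduction sizes. The accumulated rounding error of a float32 reduction tends to grow with the number of summed elements, which is consistent with loosening tolerances once `m` exceeds `64 * 1024`.

```python
import torch

torch.manual_seed(0)
for m in (1024, 64 * 1024, 4 * 1024 * 1024):
    x = torch.randn(m)
    # float64 sum serves as a near-exact reference for the float32 reduction
    ref = x.double().sum().float()
    err = (x.sum() - ref).abs().item()
    print(f"{m=}: |float32 sum - float64 sum| = {err:.2e}")
```

The exact error depends on the device and reduction order (GPU reductions are tree-shaped and differ across architectures), which is why only some CUDA devices tripped the old thresholds.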


test/test_nn.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -7437,10 +7437,15 @@ def test_layer_norm_backwards_eps(self):
         ln_out_cuda = ln_cuda(x_cuda)
         ln_out.backward(grad_output)
         ln_out_cuda.backward(grad_output_cuda)
+        atol = 1e-4
+        rtol = 1e-5
+        if m > 64 * 1024:
+            atol = 1e-3
+            rtol = 1e-3
         if elementwise_affine:
-            self.assertEqual(ln.weight.grad, ln_cuda.weight.grad, f"weight grad failed: {m=} {n=}", rtol=1e-4, atol=1e-4)
+            self.assertEqual(ln.weight.grad, ln_cuda.weight.grad, f"weight grad failed: {m=} {n=}", rtol=rtol, atol=atol)
         if bias and elementwise_affine:
-            self.assertEqual(ln.bias.grad, ln_cuda.bias.grad, f"bias grad failed: {m=} {n=}", rtol=1e-5, atol=1e-4)
+            self.assertEqual(ln.bias.grad, ln_cuda.bias.grad, f"bias grad failed: {m=} {n=}", rtol=rtol, atol=atol)

     @largeTensorTest("40GB", device="cuda")
     def test_layer_norm_large_tensor(self):
```
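
As a note on how these `rtol`/`atol` values behave: PyTorch tensor comparisons such as `torch.testing.assert_close` accept a pair when `|actual - expected| <= atol + rtol * |expected|` elementwise. The snippet below is a minimal sketch added here (using the public `assert_close` rather than the internal test harness's `assertEqual`) showing a deviation that the old tight tolerances reject but the relaxed large-tensor tolerances accept.

```python
import torch

expected = torch.full((4,), 100.0)
actual = expected + 5e-3  # every element off by 5e-3

# Old tight tolerances: allowed error = 1e-4 + 1e-5 * 100 = 1.1e-3 -> 5e-3 fails
try:
    torch.testing.assert_close(actual, expected, rtol=1e-5, atol=1e-4)
except AssertionError:
    print("rejected under rtol=1e-5, atol=1e-4")

# Relaxed large-tensor tolerances: allowed error = 1e-3 + 1e-3 * 100 = 0.101 -> passes
torch.testing.assert_close(actual, expected, rtol=1e-3, atol=1e-3)
print("accepted under rtol=1e-3, atol=1e-3")
```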
