
Commit d1d0dfe

Move deprecated test
1 parent 69d4958 commit d1d0dfe

File tree

tests/test_deprecated.py
tests/test_modules.py

2 files changed: +31 -31 lines changed


tests/test_deprecated.py

Lines changed: 31 additions & 0 deletions
@@ -142,3 +142,34 @@ def test_matmul_fp8(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
         grad_err = (gradB1 - gradB2).abs().mean()
         assert grad_err.item() < 0.003
         torch.testing.assert_close(gradB1, gradB2, atol=0.18, rtol=0.3)
+
+
+@pytest.mark.deprecated
+def test_fp8linear():
+    b = 10
+    h = 1024
+    inp = torch.randn(b, h).cuda()
+    fp32 = torch.nn.Linear(h, h * 2).cuda()
+    fp8 = bnb.research.nn.LinearFP8Mixed(h, h * 2).cuda()
+    fp32b = torch.nn.Linear(h * 2, h).cuda()
+    fp8b = bnb.research.nn.LinearFP8Mixed(h * 2, h).cuda()
+
+    fp8.weight.data.copy_(fp32.weight.data)
+    fp8.bias.data.copy_(fp32.bias.data)
+    fp8b.weight.data.copy_(fp32b.weight.data)
+    fp8b.bias.data.copy_(fp32b.bias.data)
+
+    a = fp32b(torch.nn.functional.gelu(fp32(inp)))
+    b = fp8b(torch.nn.functional.gelu(fp8(inp)))
+
+    err = (a - b).abs().mean()
+
+    a.mean().backward()
+    b.mean().backward()
+
+    graderr = (fp8.weight.grad - fp32.weight.grad).abs().mean()
+    bgraderr = (fp8.bias.grad - fp32.bias.grad).abs().mean()
+
+    assert err < 0.05
+    assert graderr < 0.00002
+    assert bgraderr < 0.00002
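
For context, the moved test uses bnb.research.nn.LinearFP8Mixed as a drop-in replacement for torch.nn.Linear. A minimal standalone sketch of that usage, assuming a CUDA-enabled bitsandbytes install and relying only on the API surface the test itself exercises:

import torch

import bitsandbytes as bnb

# LinearFP8Mixed takes (in_features, out_features) like torch.nn.Linear and
# exposes regular .weight / .bias parameters, which is why the test above can
# copy them straight from an fp32 layer.
layer = bnb.research.nn.LinearFP8Mixed(1024, 2048).cuda()
x = torch.randn(10, 1024, device="cuda")
y = layer(x)           # forward pass through the FP8 mixed-precision linear
y.mean().backward()    # backward fills layer.weight.grad and layer.bias.grad

The tolerances the test asserts (0.05 on outputs, 2e-5 on gradients) reflect that FP8 mixed precision is expected to track fp32 closely but not bit-exactly.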

tests/test_modules.py

Lines changed: 0 additions & 31 deletions
@@ -343,37 +343,6 @@ def test_kbit_backprop(device, module):
     assert kbit[0].weight.grad is None or kbit[0].bias.grad.sum().item() == 0
 
 
-@pytest.mark.deprecated
-def test_fp8linear():
-    b = 10
-    h = 1024
-    inp = torch.randn(b, h).cuda()
-    fp32 = torch.nn.Linear(h, h * 2).cuda()
-    fp8 = bnb.research.nn.LinearFP8Mixed(h, h * 2).cuda()
-    fp32b = torch.nn.Linear(h * 2, h).cuda()
-    fp8b = bnb.research.nn.LinearFP8Mixed(h * 2, h).cuda()
-
-    fp8.weight.data.copy_(fp32.weight.data)
-    fp8.bias.data.copy_(fp32.bias.data)
-    fp8b.weight.data.copy_(fp32b.weight.data)
-    fp8b.bias.data.copy_(fp32b.bias.data)
-
-    a = fp32b(torch.nn.functional.gelu(fp32(inp)))
-    b = fp8b(torch.nn.functional.gelu(fp8(inp)))
-
-    err = (a - b).abs().mean()
-
-    a.mean().backward()
-    b.mean().backward()
-
-    graderr = (fp8.weight.grad - fp32.weight.grad).abs().mean()
-    bgraderr = (fp8.bias.grad - fp32.bias.grad).abs().mean()
-
-    assert err < 0.05
-    assert graderr < 0.00002
-    assert bgraderr < 0.00002
-
-
 @pytest.mark.parametrize("device", get_available_devices())
 @pytest.mark.parametrize("embedding_dim", [64, 65])
 @pytest.mark.parametrize("input_shape", [(10,), (10, 10), (10, 10, 10)], ids=str)

0 commit comments
