     IS_PPC, \
     parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \
     skipIfTorchDynamo, skipIfRocmVersionLessThan, gcIfJetson, set_default_dtype
-from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PLATFORM_SUPPORTS_FLASH_ATTENTION
+from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PLATFORM_SUPPORTS_FLASH_ATTENTION, _get_torch_rocm_version
 from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \
     module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \
     ctcloss_reference, get_new_module_tests, single_batch_reference_fn, _test_bfloat16_ops, _test_module_empty_input
@@ -5140,6 +5140,7 @@ def test_batchnorm_nhwc_cuda(self):
         self.assertEqual(out1, out2)

     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    @parametrize_test("dims", [2, 3], name_fn=lambda x: f"{x}D")
     @parametrize_test("mode", ["train", "inference"], name_fn=lambda x: x)
     @parametrize_test(
         # test verifies cudnn/miopen batchnorm with the reference backend or memory format
@@ -5155,14 +5156,11 @@ def test_batchnorm_nhwc_cuda(self):
         [
             ("NCHW", "cpu", False, torch.float),
             ("NCHW", "cpu", True, torch.half),
-            # NCHW bfloat16 path uses native kernels for rocm<=6.3
-            # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600
-            subtest(("NCHW", "cpu", True, torch.bfloat16), decorators=[skipIfRocmVersionLessThan((6, 4))]),
+            ("NCHW", "cpu", True, torch.bfloat16),

             ("NCHW", "native", False, torch.float),
             ("NCHW", "native", True, torch.half),
-            # this config failed for train and passed for inference on ROCm6.4
-            # subtest(("NCHW", "native", True, torch.bfloat16), decorators=[unittest.expectedFailure]),
+            ("NCHW", "native", True, torch.bfloat16),

             ("NHWC", "cpu", False, torch.float),
             ("NHWC", "cpu", True, torch.half),
@@ -5174,21 +5172,41 @@ def test_batchnorm_nhwc_cuda(self):

             ("NHWC", "NCHW", False, torch.float),
             ("NHWC", "NCHW", True, torch.half),
-            # NCHW bfloat16 path uses native kernels for rocm<=6.3
-            # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600
-            subtest(("NHWC", "NCHW", True, torch.bfloat16), decorators=[skipIfRocmVersionLessThan((6, 4))]),
+            ("NHWC", "NCHW", True, torch.bfloat16),
         ],
         name_fn=lambda f, b, m, t: f"{f}_vs_{b}{'_mixed' if m else ''}_{dtype_name(t)}"
     )
-    def test_batchnorm(self, mode, memory_format, ref_backend, mixed, dtype):
+    def test_batchnorm(self, dims, mode, memory_format, ref_backend, mixed, dtype):
+        if torch.version.hip:
+            if self._testMethodName in ("test_batchnorm_2D_train_NHWC_vs_NCHW_mixed_bfloat16",
+                                        "test_batchnorm_2D_train_NCHW_vs_cpu_mixed_bfloat16",
+                                        "test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_bfloat16",
+                                        "test_batchnorm_3D_train_NCHW_vs_cpu_mixed_bfloat16"
+                                        ) and _get_torch_rocm_version() < (6, 4):
+                # NCHW bfloat16 path uses native kernels for rocm<=6.3
+                # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600
+                self.skipTest("bfloat16 NHWC train failed on ROCm <= 6.3")
+
+            if self._testMethodName in ("test_batchnorm_2D_train_NCHW_vs_native_mixed_bfloat16",
+                                        "test_batchnorm_3D_train_NCHW_vs_native_mixed_bfloat16"
+                                        ) and _get_torch_rocm_version() >= (6, 4):
+                self.skipTest("bfloat16 NCHW train failed due to native tolerance issue SWDEV-507600")
+
+            if self._testMethodName == "test_batchnorm_3D_train_NCHW_vs_native_mixed_float16" \
+                    and _get_torch_rocm_version() < (6, 4):
5197+ self.skipTest("3D float16 NCHW train failed on ROCm<=6.3 ")
+
+        if dims == 3 and memory_format in ("NHWC", "NCHW"):
+            memory_format = memory_format + "3D"
+
         def _create_tensor(size, memory_format, dtype, device):
             t = torch.empty(size=size, memory_format=memory_format, dtype=dtype, device=device)
             t = t.random_(1, 10)
             return t

         def _get_ref_device(backend: str, device: str):
             # If 'backend' specifies the memory format, return the 'device' arg; otherwise return a device that matches the backend
-            if backend in ("NHWC", "NCHW"):
+            if backend in ("NHWC", "NHWC3D", "NCHW", "NCHW3D"):
                 return device
             if backend == "native":
                 return "cuda"
@@ -5201,9 +5219,11 @@ def _get_backend_memory_format(backend: str, memory_format: torch.memory_format)
             # If 'backend' specifies the memory format, return it, otherwise look at 'memory_format' arg
             if backend == "NHWC":
                 return torch.channels_last
-            if backend == "NCHW":
+            if backend == "NHWC3D":
+                return torch.channels_last_3d
+            if backend in ("NCHW", "NCHW3D"):
                 return torch.contiguous_format
-            if memory_format in (torch.contiguous_format, torch.channels_last):
+            if memory_format in (torch.contiguous_format, torch.channels_last, torch.channels_last_3d):
                 return memory_format
             raise ValueError("Unable to detect memory format for backend={backend} and memory_format={memory_format}")

@@ -5212,10 +5232,24 @@ def _get_memory_format(t: torch.Tensor) -> torch.memory_format:
                 return torch.contiguous_format
             if t.is_contiguous(memory_format=torch.channels_last):
                 return torch.channels_last
+            if t.is_contiguous(memory_format=torch.channels_last_3d):
+                return torch.channels_last_3d
+            return ValueError("Unsupported memory_format")
+
+        def _get_memory_format_from_name(memory_format_name: str) -> torch.memory_format:
+            if memory_format_name == "NHWC":
+                return torch.channels_last
+            elif memory_format_name == "NHWC3D":
+                return torch.channels_last_3d
+            elif memory_format_name in ("NCHW", "NCHW3D"):
+                return torch.contiguous_format
             return ValueError("Unsupported memory_format")

         def _create_backend(inp: torch.Tensor, mixed: bool = False):
-            mod = nn.BatchNorm2d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype)
+
+            mod = nn.BatchNorm2d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype) \
+                if inp.dim() == 4 else \
+                nn.BatchNorm3d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype)
             return mod

         def _test_batchnorm_train(inp, grad, mixed, ref_inp, ref_grad, ref_backend):
@@ -5242,12 +5276,13 @@ def _test_batchnorm_train(inp, grad, mixed, ref_inp, ref_grad, ref_backend):
             self.assertEqual(mod.running_var, ref_mod.running_var)
             self.assertEqual(inp.grad, ref_inp.grad)

-        def _train(memory_format, ref_backend, mixed, dtype):
-            memory_format = torch.contiguous_format if memory_format == "NCHW" else torch.channels_last
+        def _train(memory_format_name, ref_backend, mixed, dtype):
+            memory_format = _get_memory_format_from_name(memory_format_name)
+
             ref_memory_format = _get_backend_memory_format(ref_backend, memory_format)
             ref_device = _get_ref_device(ref_backend, device="cuda")

-            size = (4, 8, 2, 2)
+            size = (4, 8, 2, 2, 2) if memory_format_name in ("NCHW3D", "NHWC3D") else (4, 8, 2, 2)
             inp = _create_tensor(size, memory_format, dtype, device="cuda").detach().requires_grad_()
             grad = _create_tensor(size, memory_format, dtype, device="cuda")
             ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device).requires_grad_()
@@ -5275,12 +5310,12 @@ def _train(memory_format, ref_backend, mixed, dtype):
             # _test_batchnorm_train(input=input, grad=grad, mixed=mixed,
             #     ref_input=ref_input, ref_grad=ref_grad, ref_backend=ref_backend)

-        def _inference(memory_format, ref_backend, mixed, dtype):
-            memory_format = torch.contiguous_format if memory_format == "NCHW" else torch.channels_last
+        def _inference(memory_format_name, ref_backend, mixed, dtype):
+            memory_format = _get_memory_format_from_name(memory_format_name)
             ref_memory_format = _get_backend_memory_format(ref_backend, memory_format)
             ref_device = _get_ref_device(ref_backend, device="cuda")

-            size = (2, 64, 50, 50)
+            size = (2, 64, 50, 50, 50) if memory_format_name in ("NCHW3D", "NHWC3D") else (2, 64, 50, 50)
             inp = _create_tensor(size, memory_format, dtype, device="cuda")
             ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device)
             mod = _create_backend(inp, mixed).eval()