
Commit bbf4b9b

[release/2.6] Add 3D batchnorm tests (#2214)
Additive on top of #2209: adds 3D batchnorm tests (NHWC3D and NCHW3D).

NCHW 3D tests:

```
test_batchnorm_3D_inference_NCHW_vs_cpu_float32 (__main__.TestNN) ... ok (0.149s)
test_batchnorm_3D_inference_NCHW_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.062s)
test_batchnorm_3D_inference_NCHW_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.042s)
test_batchnorm_3D_inference_NCHW_vs_native_float32 (__main__.TestNN) ... ok (0.091s)
test_batchnorm_3D_inference_NCHW_vs_native_mixed_bfloat16 (__main__.TestNN) ... ok (0.008s)
test_batchnorm_3D_inference_NCHW_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.007s)
test_batchnorm_3D_inference_NHWC_vs_NCHW_float32 (__main__.TestNN) ... ok (0.028s)
test_batchnorm_3D_inference_NHWC_vs_NCHW_mixed_bfloat16 (__main__.TestNN) ... ok (0.010s)
test_batchnorm_3D_inference_NHWC_vs_NCHW_mixed_float16 (__main__.TestNN) ... ok (0.010s)
test_batchnorm_3D_inference_NHWC_vs_cpu_float32 (__main__.TestNN) ... ok (0.091s)
test_batchnorm_3D_inference_NHWC_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.020s)
test_batchnorm_3D_inference_NHWC_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.023s)
test_batchnorm_3D_inference_NHWC_vs_native_float32 (__main__.TestNN) ... ok (0.010s)
test_batchnorm_3D_inference_NHWC_vs_native_mixed_bfloat16 (__main__.TestNN) ... ok (0.015s)
test_batchnorm_3D_inference_NHWC_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.007s)
test_batchnorm_3D_train_NCHW_vs_cpu_float32 (__main__.TestNN) ... ok (0.011s)
test_batchnorm_3D_train_NCHW_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.006s)
test_batchnorm_3D_train_NCHW_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.006s)
test_batchnorm_3D_train_NCHW_vs_native_float32 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_3D_train_NCHW_vs_native_mixed_bfloat16 (__main__.TestNN) ... skip: bfloat16 NCHW train failed due to native tolerance issue SWDEV-507600 (0.002s)
test_batchnorm_3D_train_NCHW_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_3D_train_NHWC_vs_NCHW_float32 (__main__.TestNN) ... ok (0.006s)
test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_bfloat16 (__main__.TestNN) ... ok (0.006s)
test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_float16 (__main__.TestNN) ... ok (0.006s)
test_batchnorm_3D_train_NHWC_vs_cpu_float32 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_3D_train_NHWC_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_3D_train_NHWC_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_3D_train_NHWC_vs_native_float32 (__main__.TestNN) ... ok (0.011s)
test_batchnorm_3D_train_NHWC_vs_native_mixed_bfloat16 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_3D_train_NHWC_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.004s)
```

The old batchnorm tests now have `2D` in their names:

```
test_batchnorm_2D_inference_NCHW_vs_cpu_float32 (__main__.TestNN) ... ok (0.023s)
test_batchnorm_2D_inference_NCHW_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.005s)
test_batchnorm_2D_inference_NCHW_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.005s)
test_batchnorm_2D_inference_NCHW_vs_native_float32 (__main__.TestNN) ... ok (0.104s)
test_batchnorm_2D_inference_NCHW_vs_native_mixed_bfloat16 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_2D_inference_NCHW_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.003s)
test_batchnorm_2D_inference_NHWC_vs_NCHW_float32 (__main__.TestNN) ... ok (0.020s)
test_batchnorm_2D_inference_NHWC_vs_NCHW_mixed_bfloat16 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_2D_inference_NHWC_vs_NCHW_mixed_float16 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_2D_inference_NHWC_vs_cpu_float32 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_2D_inference_NHWC_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.003s)
test_batchnorm_2D_inference_NHWC_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.003s)
test_batchnorm_2D_inference_NHWC_vs_native_float32 (__main__.TestNN) ... ok (0.003s)
test_batchnorm_2D_inference_NHWC_vs_native_mixed_bfloat16 (__main__.TestNN) ... ok (0.003s)
test_batchnorm_2D_inference_NHWC_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.003s)
test_batchnorm_2D_train_NCHW_vs_cpu_float32 (__main__.TestNN) ... ok (0.011s)
test_batchnorm_2D_train_NCHW_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.006s)
test_batchnorm_2D_train_NCHW_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.006s)
test_batchnorm_2D_train_NCHW_vs_native_float32 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_2D_train_NCHW_vs_native_mixed_bfloat16 (__main__.TestNN) ... skip: bfloat16 NCHW train failed due to native tolerance issue SWDEV-507600 (0.002s)
test_batchnorm_2D_train_NCHW_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_2D_train_NHWC_vs_NCHW_float32 (__main__.TestNN) ... ok (0.006s)
test_batchnorm_2D_train_NHWC_vs_NCHW_mixed_bfloat16 (__main__.TestNN) ... ok (0.006s)
test_batchnorm_2D_train_NHWC_vs_NCHW_mixed_float16 (__main__.TestNN) ... ok (0.006s)
test_batchnorm_2D_train_NHWC_vs_cpu_float32 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_2D_train_NHWC_vs_cpu_mixed_bfloat16 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_2D_train_NHWC_vs_cpu_mixed_float16 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_2D_train_NHWC_vs_native_float32 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_2D_train_NHWC_vs_native_mixed_bfloat16 (__main__.TestNN) ... ok (0.004s)
test_batchnorm_2D_train_NHWC_vs_native_mixed_float16 (__main__.TestNN) ... ok (0.004s)
```

Tested in `compute-rocm-dkms-no-npi-hipclang` image build 16062:
`compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:16062_ubuntu22.04_py3.10_pytorch_lw_release-2.7_1fee1967`

Tests can be run with the environment variable `MIOPEN_ENABLE_LOGGING_CMD=1` to collect MIOpenDriver commands:

```
MIOPEN_ENABLE_LOGGING_CMD=1 python test_nn.py -v -k test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_bfloat16
test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_bfloat16 (__main__.TestNN) ... MIOpen(HIP): Command [LogCmdBNorm] ./bin/MIOpenDriver bnormbfp16 -n 4 -c 8 -D 2 -H 2 -W 2 -m 1 --forw 1 -b 0 -r 1 -s 1 --layout NDHWC
MIOpen(HIP): Command [LogCmdBNorm] ./bin/MIOpenDriver bnormbfp16 -n 4 -c 8 -D 2 -H 2 -W 2 -m 1 --forw 0 -b 1 -s 1 --layout NDHWC
MIOpen(HIP): Command [LogCmdBNorm] ./bin/MIOpenDriver bnormbfp16 -n 4 -c 8 -D 2 -H 2 -W 2 -m 1 --forw 1 -b 0 -r 1 -s 1 --layout NCDHW
MIOpen(HIP): Command [LogCmdBNorm] ./bin/MIOpenDriver bnormbfp16 -n 4 -c 8 -D 2 -H 2 -W 2 -m 1 --forw 0 -b 1 -s 1 --layout NCDHW
ok
```

Co-authored-by: Jeff Daily <[email protected]>
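Editor's note: as an illustrative aid only (not part of this commit), the sketch below shows what the NHWC3D/NDHWC path exercises: a 5D tensor in `torch.channels_last_3d` layout fed through `nn.BatchNorm3d`, using the same N=4, C=8, D=2, H=2, W=2 shape that appears as `-n 4 -c 8 -D 2 -H 2 -W 2` in the MIOpenDriver commands above. It assumes a CUDA/ROCm build of PyTorch with a GPU available.

```
# Illustrative sketch only; assumes a CUDA/ROCm device is available.
import torch
import torch.nn as nn

# Same shape the new tests use: N=4, C=8, D=2, H=2, W=2.
x = torch.randn(4, 8, 2, 2, 2, device="cuda").to(memory_format=torch.channels_last_3d)
bn = nn.BatchNorm3d(8, device="cuda")

out = bn(x)
# Check whether the NDHWC (channels_last_3d) layout survived the batchnorm call.
print(out.is_contiguous(memory_format=torch.channels_last_3d))
```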
1 parent 2045a75 commit bbf4b9b

File tree

1 file changed: +55 −20 lines


test/test_nn.py

Lines changed: 55 additions & 20 deletions
@@ -36,7 +36,7 @@
     IS_PPC, \
     parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \
     skipIfTorchDynamo, skipIfRocmVersionLessThan, gcIfJetson, set_default_dtype
-from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PLATFORM_SUPPORTS_FLASH_ATTENTION
+from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PLATFORM_SUPPORTS_FLASH_ATTENTION, _get_torch_rocm_version
 from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \
     module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \
     ctcloss_reference, new_module_tests, single_batch_reference_fn, _test_bfloat16_ops, _test_module_empty_input
@@ -5089,6 +5089,7 @@ def test_batchnorm_nhwc_cuda(self):
         self.assertEqual(out1, out2)
 
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    @parametrize_test("dims", [2, 3], name_fn=lambda x: f"{x}D")
     @parametrize_test("mode", ["train", "inference"], name_fn=lambda x: x)
     @parametrize_test(
         # test verifies cudnn/miopen batchnorm with the reference backend or memory format
@@ -5104,14 +5105,11 @@ def test_batchnorm_nhwc_cuda(self):
         [
             ("NCHW", "cpu", False, torch.float),
             ("NCHW", "cpu", True, torch.half),
-            # NCHW bfloat16 path uses native kernels for rocm<=6.3
-            # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600
-            subtest(("NCHW", "cpu", True, torch.bfloat16), decorators=[skipIfRocmVersionLessThan((6, 4))]),
+            ("NCHW", "cpu", True, torch.bfloat16),
 
             ("NCHW", "native", False, torch.float),
             ("NCHW", "native", True, torch.half),
-            # this config failed for train and passed for inference on ROCm6.4
-            # subtest(("NCHW", "native", True, torch.bfloat16), decorators=[unittest.expectedFailure]),
+            ("NCHW", "native", True, torch.bfloat16),
 
             ("NHWC", "cpu", False, torch.float),
             ("NHWC", "cpu", True, torch.half),
@@ -5123,21 +5121,41 @@ def test_batchnorm_nhwc_cuda(self):
 
             ("NHWC", "NCHW", False, torch.float),
             ("NHWC", "NCHW", True, torch.half),
-            # NCHW bfloat16 path uses native kernels for rocm<=6.3
-            # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600
-            subtest(("NHWC", "NCHW", True, torch.bfloat16), decorators=[skipIfRocmVersionLessThan((6, 4))]),
+            ("NHWC", "NCHW", True, torch.bfloat16),
         ],
         name_fn=lambda f, b, m, t: f"{f}_vs_{b}{'_mixed' if m else ''}_{dtype_name(t)}"
     )
-    def test_batchnorm(self, mode, memory_format, ref_backend, mixed, dtype):
+    def test_batchnorm(self, dims, mode, memory_format, ref_backend, mixed, dtype):
+        if torch.version.hip:
+            if self._testMethodName in ("test_batchnorm_2D_train_NHWC_vs_NCHW_mixed_bfloat16",
+                                        "test_batchnorm_2D_train_NCHW_vs_cpu_mixed_bfloat16",
+                                        "test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_bfloat16",
+                                        "test_batchnorm_3D_train_NCHW_vs_cpu_mixed_bfloat16"
+                                        ) and _get_torch_rocm_version() < (6, 4):
+                # NCHW bfloat16 path uses native kernels for rocm<=6.3
+                # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600
+                self.skipTest("bfloat16 NHWC train failed on ROCm <= 6.3")
+
+            if self._testMethodName in ("test_batchnorm_2D_train_NCHW_vs_native_mixed_bfloat16",
+                                        "test_batchnorm_3D_train_NCHW_vs_native_mixed_bfloat16"
+                                        ) and _get_torch_rocm_version() >= (6, 4):
+                self.skipTest("bfloat16 NCHW train failed due to native tolerance issue SWDEV-507600")
+
+            if self._testMethodName == "test_batchnorm_3D_train_NCHW_vs_native_mixed_float16" \
+                    and _get_torch_rocm_version() < (6, 4):
+                self.skipTest("3D float16 NCHW train failed on ROCm<=6.3")
+
+        if dims == 3 and memory_format in ("NHWC", "NCHW"):
+            memory_format = memory_format + "3D"
+
         def _create_tensor(size, memory_format, dtype, device):
             t = torch.empty(size=size, memory_format=memory_format, dtype=dtype, device=device)
             t = t.random_(1, 10)
             return t
 
         def _get_ref_device(backend: str , device: str):
             # If 'backend' specifies the memory format, return 'device' arg, otherwise return a device matches the backend
-            if backend in ("NHWC", "NCHW"):
+            if backend in ("NHWC", "NHWC3D", "NCHW", "NCHW3D"):
                 return device
             if backend == "native":
                 return "cuda"
@@ -5150,9 +5168,11 @@ def _get_backend_memory_format(backend: str, memory_format: torch.memory_format)
             # If 'backend' specifies the memory format, return it, otherwise look at 'memory_format' arg
             if backend == "NHWC":
                 return torch.channels_last
-            if backend == "NCHW":
+            if backend == "NHWC3D":
+                return torch.channels_last_3d
+            if backend in ("NCHW", "NCHW3D"):
                 return torch.contiguous_format
-            if memory_format in (torch.contiguous_format, torch.channels_last):
+            if memory_format in (torch.contiguous_format, torch.channels_last, torch.channels_last_3d):
                 return memory_format
             raise ValueError("Unable to detect memory format for backend={backend} and memory_format={memory_format}")
 
@@ -5161,10 +5181,24 @@ def _get_memory_format(t: torch.Tensor) -> torch.memory_format:
                 return torch.contiguous_format
             if t.is_contiguous(memory_format=torch.channels_last):
                 return torch.channels_last
+            if t.is_contiguous(memory_format=torch.channels_last_3d):
+                return torch.channels_last_3d
+            return ValueError("Unsupported memory_format")
+
+        def _get_memory_format_from_name(memory_format_name: str) -> torch.memory_format:
+            if memory_format_name == "NHWC":
+                return torch.channels_last
+            elif memory_format_name == "NHWC3D":
+                return torch.channels_last_3d
+            elif memory_format_name in ("NCHW", "NCHW3D"):
+                return torch.contiguous_format
             return ValueError("Unsupported memory_format")
 
         def _create_backend(inp: torch.Tensor, mixed: bool = False):
-            mod = nn.BatchNorm2d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype)
+
+            mod = nn.BatchNorm2d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype) \
+                if inp.dim() == 4 else \
+                nn.BatchNorm3d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype)
             return mod
 
         def _test_batchnorm_train(inp, grad, mixed, ref_inp, ref_grad, ref_backend):
@@ -5191,12 +5225,13 @@ def _test_batchnorm_train(inp, grad, mixed, ref_inp, ref_grad, ref_backend):
             self.assertEqual(mod.running_var, ref_mod.running_var)
             self.assertEqual(inp.grad, ref_inp.grad)
 
-        def _train(memory_format, ref_backend, mixed, dtype):
-            memory_format = torch.contiguous_format if memory_format == "NCHW" else torch.channels_last
+        def _train(memory_format_name, ref_backend, mixed, dtype):
+            memory_format = _get_memory_format_from_name(memory_format_name)
+
             ref_memory_format = _get_backend_memory_format(ref_backend, memory_format)
             ref_device = _get_ref_device(ref_backend, device="cuda")
 
-            size = (4, 8, 2, 2)
+            size = (4, 8, 2, 2, 2) if memory_format_name in ("NCHW3D", "NHWC3D") else (4, 8, 2, 2)
             inp = _create_tensor(size, memory_format, dtype, device="cuda").detach().requires_grad_()
             grad = _create_tensor(size, memory_format, dtype, device="cuda")
             ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device).requires_grad_()
@@ -5224,12 +5259,12 @@ def _train(memory_format, ref_backend, mixed, dtype):
             # _test_batchnorm_train(input=input, grad=grad, mixed=mixed,
             #                       ref_input=ref_input, ref_grad=ref_grad, ref_backend=ref_backend)
 
-        def _inference(memory_format, ref_backend, mixed, dtype):
-            memory_format = torch.contiguous_format if memory_format == "NCHW" else torch.channels_last
+        def _inference(memory_format_name, ref_backend, mixed, dtype):
+            memory_format = _get_memory_format_from_name(memory_format_name)
             ref_memory_format = _get_backend_memory_format(ref_backend, memory_format)
             ref_device = _get_ref_device(ref_backend, device="cuda")
 
-            size = (2, 64, 50, 50)
+            size = (2, 64, 50, 50, 50) if memory_format_name in ("NCHW3D", "NHWC3D") else (2, 64, 50, 50)
             inp = _create_tensor(size, memory_format, dtype, device="cuda")
             ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device)
             mod = _create_backend(inp, mixed).eval()
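Editor's note: as a reading aid for the parametrization added above, here is a hedged, standalone sketch of how the stacked `name_fn`s compose into the test names listed in the commit message. The `dtype_name` helper below is a stand-in, not the exact implementation in test_nn.py, and the composition is shown by hand rather than via `parametrize_test` itself.

```
# Standalone sketch (assumed helper names) of the generated test-name pattern,
# e.g. test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_bfloat16.
import torch

def dtype_name(t: torch.dtype) -> str:
    # Stand-in helper: "torch.bfloat16" -> "bfloat16"
    return str(t).split(".")[-1]

dims_fn = lambda d: f"{d}D"          # name_fn for the "dims" parameter
mode_fn = lambda m: m                # name_fn for the "mode" parameter
cfg_fn = lambda f, b, m, t: f"{f}_vs_{b}{'_mixed' if m else ''}_{dtype_name(t)}"

# One of the configs from the diff: NHWC compared against NCHW, mixed, bfloat16.
d, mode, (f, b, m, t) = 3, "train", ("NHWC", "NCHW", True, torch.bfloat16)
print(f"test_batchnorm_{dims_fn(d)}_{mode_fn(mode)}_{cfg_fn(f, b, m, t)}")
# -> test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_bfloat16
```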
