Skip to content

Commit bf0079d

Browse files
okakarpa and jeffdaily authored
[release/2.7] [SWDEV-535259] enable miopen channels last 3d for conv and batchnorm (#2232)
Cherry-pick of #2209 Co-authored-by: Jeff Daily <[email protected]>
1 parent 2337da4 commit bf0079d

File tree

4 files changed

+18
-13
lines changed

4 files changed

+18
-13
lines changed

aten/src/ATen/native/ConvUtils.h

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -362,20 +362,24 @@ inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Ten
362362
return false;
363363
}
364364

365-
bool can_use_miopen_channels_last_2d = false;
366365
// TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
367366
// See #64427
368367
static std::optional<bool> PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC");
368+
static bool suggest_nhwc = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC;
369369

370370
auto input_memory_format = input.suggest_memory_format();
371371
auto weight_memory_format = weight.suggest_memory_format();
372+
auto weight_ndim = weight.ndimension();
372373

373-
can_use_miopen_channels_last_2d = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC && (
374-
( (input_memory_format == at::MemoryFormat::ChannelsLast) ||
375-
(weight_memory_format == at::MemoryFormat::ChannelsLast) )
376-
);
374+
bool can_use_miopen_channels_last_2d = suggest_nhwc && (weight_ndim == 4) && (
375+
(input_memory_format == at::MemoryFormat::ChannelsLast) ||
376+
(weight_memory_format == at::MemoryFormat::ChannelsLast)
377+
);
377378

378-
bool can_use_miopen_channels_last_3d = false;
379+
bool can_use_miopen_channels_last_3d = suggest_nhwc && (weight_ndim == 5) && (
380+
(input_memory_format == at::MemoryFormat::ChannelsLast3d) ||
381+
(weight_memory_format == at::MemoryFormat::ChannelsLast3d)
382+
);
379383

380384
return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d;
381385
}

aten/src/ATen/native/Convolution.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1430,7 +1430,7 @@ static inline at::MemoryFormat determine_backend_memory_format(
14301430
if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) {
14311431
TORCH_INTERNAL_ASSERT((k == 4 || k == 5),
14321432
"Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()");
1433-
backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast;
1433+
backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
14341434
}
14351435
break;
14361436
case ConvBackend::Mkldnn:

aten/src/ATen/native/Normalization.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,7 @@ BatchNormBackend _select_batch_norm_backend(
544544
&& (input.suggest_memory_format() == MemoryFormat::Contiguous
545545
#if (defined(USE_ROCM) && ROCM_VERSION >= 60500)
546546
|| (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM)
547+
|| (input.suggest_memory_format() == MemoryFormat::ChannelsLast3d && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM)
547548
#endif
548549
)
549550
) {

aten/src/ATen/native/miopen/Conv_miopen.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -763,7 +763,7 @@ Tensor miopen_convolution_forward(
763763

764764
auto memory_format = at::MemoryFormat::Contiguous;
765765
if (miopen_conv_use_channels_last(*input, *weight)) {
766-
memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
766+
memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
767767
}
768768

769769
Tensor output_t = at::detail::empty_cuda(
@@ -872,7 +872,7 @@ Tensor miopen_depthwise_convolution_forward(
872872

873873
auto memory_format = at::MemoryFormat::Contiguous;
874874
if (miopen_conv_use_channels_last(*input, *weight)) {
875-
memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
875+
memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
876876
}
877877

878878
Tensor output_t = at::detail::empty_cuda(
@@ -1074,7 +1074,7 @@ Tensor miopen_depthwise_convolution_backward_weight(
10741074

10751075
auto memory_format = at::MemoryFormat::Contiguous;
10761076
if (miopen_conv_use_channels_last(*input, *grad_output)) {
1077-
memory_format = (input->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
1077+
memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
10781078
}
10791079

10801080
Tensor grad_output_contig_t = grad_output->contiguous(memory_format);
@@ -1127,7 +1127,7 @@ Tensor miopen_convolution_backward_weight(
11271127

11281128
auto memory_format = at::MemoryFormat::Contiguous;
11291129
if (miopen_conv_use_channels_last(*input, *grad_output)) {
1130-
memory_format = (input->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
1130+
memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
11311131
}
11321132

11331133
Tensor grad_output_contig_t = grad_output->contiguous(memory_format);
@@ -1281,7 +1281,7 @@ Tensor miopen_convolution_backward_input(
12811281

12821282
auto memory_format = at::MemoryFormat::Contiguous;
12831283
if (miopen_conv_use_channels_last(*grad_output, *weight)) {
1284-
memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
1284+
memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
12851285
}
12861286

12871287
Tensor grad_input_t = at::detail::empty_cuda(
@@ -1389,7 +1389,7 @@ Tensor miopen_depthwise_convolution_backward_input(
13891389

13901390
auto memory_format = at::MemoryFormat::Contiguous;
13911391
if (miopen_conv_use_channels_last(*grad_output, *weight)) {
1392-
memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
1392+
memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
13931393
}
13941394

13951395
Tensor grad_input_t = at::detail::empty_cuda(

0 commit comments

Comments (0)