
Commit 88a9bd4

[ROCm] revamp miopen integration
Update sources under ATen/miopen and ATen/native/miopen to align with best practices. Avoid reshape_ calls inside backward operations.
1 parent 4e630f0 commit 88a9bd4

8 files changed: +1659 additions, −936 deletions


aten/src/ATen/miopen/Descriptors.cpp

Lines changed: 64 additions & 33 deletions

@@ -1,13 +1,16 @@
 #include <ATen/miopen/Descriptors.h>
+
 #include <ATen/ATen.h>
 #include <c10/util/irange.h>
 
 #include <iostream>
+#include <sstream>
 
-namespace at { namespace native {
+namespace at::native {
 
 namespace {
 
+
 inline miopenDataType_t getDataType(const at::Tensor& t) {
   auto scalar_type = t.scalar_type();
   if (scalar_type == at::kFloat) {
@@ -16,34 +19,37 @@ inline miopenDataType_t getDataType(const at::Tensor& t) {
     return miopenHalf;
   } else if (scalar_type == at::kBFloat16) {
     return miopenBFloat16;
-  } else {
-    TORCH_CHECK(
-        false,
-        "TensorDescriptor only supports float, half and bfloat16 tensors");
   }
+  TORCH_CHECK(false, "TensorDescriptor does not support ", scalar_type);
 }
 
+constexpr size_t MIOPEN_DIM_MAX = 5;
+
 } // anonymous namespace
 
+void TensorDescriptor::set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad) {
+  set(getDataType(t), t.sizes(), t.strides(), pad,
+      memory_format == at::MemoryFormat::ChannelsLast ||
+      memory_format == at::MemoryFormat::ChannelsLast3d);
+}
 
 void TensorDescriptor::set(const at::Tensor &t, size_t pad) {
-  set(getDataType(t), t.sizes(), t.strides(), pad);
+  auto memory_format = t.suggest_memory_format();
+  set(getDataType(t), t.sizes(), t.strides(), pad,
+      memory_format == at::MemoryFormat::ChannelsLast ||
+      memory_format == at::MemoryFormat::ChannelsLast3d);
 }
 
-constexpr size_t MIOPEN_DIM_MAX = 5;
-
 void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad) {
+  set(datatype, t_sizes, t_strides, pad,
+      is_channels_last_strides_2d(t_sizes, t_strides) ||
+      is_channels_last_strides_3d(t_sizes, t_strides));
+}
+
+void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad, bool nhwc) {
   size_t dim = t_sizes.size();
   if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX)
-#define _STR(X) #X
-#define STR(X) _STR(X)
-    TORCH_CHECK(
-        false,
-        "MIOpen supports only up to ",
-        STR(MIOPEN_DIM_MAX),
-        " dimensions");
-#undef _STR
-#undef STR
+    TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions");
   int size[MIOPEN_DIM_MAX];
   int stride[MIOPEN_DIM_MAX];
   for (const auto i : c10::irange(dim)) {
@@ -54,7 +60,7 @@ void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntAr
     size[i] = 1;
     stride[i] = 1;
   }
-  set(datatype, static_cast<int>(std::max(dim, pad)), size, stride);
+  set(datatype, static_cast<int>(std::max(dim, pad)), size, stride, nhwc);
 }
 
 std::string miopenTypeToString(miopenDataType_t dtype) {
@@ -74,10 +80,11 @@ std::string miopenTypeToString(miopenDataType_t dtype) {
 
 std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
   out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
-  int nbDims = 4;
+  int nbDims = 0;
   int dimA[MIOPEN_DIM_MAX];
   int strideA[MIOPEN_DIM_MAX];
   miopenDataType_t dtype;
+  miopenGetTensorDescriptorSize(d.desc(), &nbDims);
   miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA);
   out << "    type = " << miopenTypeToString(dtype) << "\n";
   out << "    nbDims = " << nbDims << "\n";
@@ -99,19 +106,17 @@ void TensorDescriptor::print() { std::cout << *this; }
 
 void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad) {
   auto dim = t.ndimension();
-  if (dim > static_cast<int64_t>(MIOPEN_DIM_MAX) || pad > static_cast<int64_t>(MIOPEN_DIM_MAX)) {
-#define _STR(X) #X
-#define STR(X) _STR(X)
-    TORCH_CHECK(
-        false,
-        "MIOpen supports only up to ",
-        STR(MIOPEN_DIM_MAX),
-        " dimensions");
-#undef _STR
-#undef STR
-  }
+  if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX)
+    TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions");
+  // NB: It is possible for this test to be insufficient, because the
+  // Tensor passed in to set the filter descriptor may not be the actual
+  // Tensor whose data pointer is passed to cuDNN. Nevertheless,
+  // that is the common case, so we can catch most client errors with this test.
   TORCH_CHECK(t.is_contiguous(memory_format),
-    "MIOpen filters (a.k.a. weights) must be contiguous");
+    "MIOpen filters (a.k.a. weights) must be contiguous in desired memory_format\n",
+    "Weight sizes: ", t.sizes(), "\n",
+    "Weight strides: ", t.strides(), "\n",
+    "cuDNN suggested memory_format: ", memory_format);
 
   int size[MIOPEN_DIM_MAX];
   int stride[MIOPEN_DIM_MAX];
@@ -131,7 +136,33 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
   }
 
   dim = std::max<int64_t>(dim, pad);
-  set(getDataType(t), (int) dim, size, stride);
+  set(getDataType(t), static_cast<int>(dim), size, stride);
 }
 
-}}
+std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) {
+  out << "FilterDescriptor " << static_cast<void*>(d.desc()) << "\n";
+  int nbDims = 0;
+  int dimA[MIOPEN_DIM_MAX];
+  int strideA[MIOPEN_DIM_MAX];
+  miopenDataType_t dtype;
+  miopenGetTensorDescriptorSize(d.desc(), &nbDims);
+  miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA);
+  out << "    type = " << miopenTypeToString(dtype) << "\n";
+  out << "    nbDims = " << nbDims << "\n";
+  // Read out only nbDims of the arrays!
+  out << "    dimA = ";
+  for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
+    out << i << ", ";
+  }
+  out << "\n";
+  out << "    strideA = ";
+  for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
+    out << i << ", ";
+  }
+  out << "\n";
+  return out;
+}
+
+void FilterDescriptor::print() { std::cout << *this; }
+
+}
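
For readers trying the new descriptor API: a minimal usage sketch, assuming a ROCm/HIP build of PyTorch where ATen and the ATen/miopen headers are available. The shapes, dtype, and variable names are illustrative, not part of this commit.

// Sketch: build a TensorDescriptor for a channels-last tensor using the new
// overload that takes an explicit memory format.
#include <ATen/ATen.h>
#include <ATen/miopen/Descriptors.h>
#include <iostream>

int main() {
  // An NCHW-shaped tensor laid out as channels-last (NHWC strides).
  at::Tensor t = at::empty({2, 8, 16, 16},
                           at::device(at::kCUDA).dtype(at::kFloat))
                     .contiguous(at::MemoryFormat::ChannelsLast);

  // New overload: the caller states the memory format explicitly, so the
  // descriptor is built with NHWC-style strides (via fixSizeOneDimStride).
  at::native::TensorDescriptor desc;
  desc.set(t, t.suggest_memory_format());

  // The old two-argument overload still works; it now infers the format
  // from the tensor's suggest_memory_format().
  at::native::TensorDescriptor desc2;
  desc2.set(t);

  std::cout << desc;  // operator<< prints dtype, nbDims, dimA, strideA
  return 0;
}

The explicit memory_format overload lets convolution code pass along the format it has already decided on, instead of re-deriving it from the tensor's strides.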

aten/src/ATen/miopen/Descriptors.h

Lines changed: 68 additions & 5 deletions

@@ -1,13 +1,16 @@
 #pragma once
 
-#include <ATen/miopen/Exceptions.h>
+#include <string>
 
+#include <ATen/miopen/Exceptions.h>
 #include <ATen/miopen/miopen-wrapper.h>
 #include <ATen/core/Tensor.h>
 #include <ATen/TensorUtils.h>
 #include <c10/macros/Export.h>
 
-namespace at { namespace native {
+namespace at::native {
+
+std::string miopenTypeToString(miopenDataType_t dtype);
 
 inline int dataSize(miopenDataType_t dataType)
 {
@@ -19,6 +22,43 @@ inline int dataSize(miopenDataType_t dataType)
   }
 }
 
+// The stride for a size-1 dimension is not uniquely determined; in
+// fact, it can be anything you want, because the fact that the
+// tensor is size 1 at this dimension means that you will never actually
+// try advancing your pointer by this stride.
+//
+// We duplicate the CuDNN restriction here for MIOpen.
+//
+// However, CuDNN has a much more stringent requirement on strides:
+// if you are passing a contiguous input, it better be the case
+// that the stride for dim i is the product of the sizes of dims
+// i+1 to the end. This stride is indeed uniquely determined. This
+// function modifies 'stride' in place so this invariant holds.
+template <typename T>
+static inline void fixSizeOneDimStride(int dim, const T *size, T *stride, bool nhwc) {
+  int64_t z = 1;
+  int index = 0;
+  std::vector<int> permutation(dim);
+
+  if (nhwc) {
+    permutation[index++] = 1;
+  }
+  for (int d = dim-1; d > 1; d--) {
+    permutation[index++] = d;
+  }
+  if (!nhwc) {
+    permutation[index++] = 1;
+  }
+  permutation[index++] = 0;
+  for (int d : permutation) {
+    if (size[d] == 1) {
+      stride[d] = z;
+    } else {
+      z *= size[d];
+    }
+  }
+}
+
 template <typename T, miopenStatus_t (*dtor)(T*)>
 struct DescriptorDeleter {
   void operator()(T* x) {
@@ -41,6 +81,8 @@ template <typename T, miopenStatus_t (*ctor)(T**), miopenStatus_t (*dtor)(T*)>
 // NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_HIP_CPP_API Descriptor {
 public:
+  // TODO: Figure out why const-correctness doesn't work here
+
   // Use desc() to access the underlying descriptor pointer in
   // a read-only fashion. Most client code should use this.
   // If the descriptor was never initialized, this will return
@@ -75,14 +117,32 @@ class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor<
     set(t, pad);
   }
 
+  // Note [MIOpen broadcast padding]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // pad specifies the minimum dimensionality of the tensor descriptor
+  // we produce (it doesn't have anything to do with, e.g., convolution
+  // padding). If 't' is lower-dimensional than 'pad', the remaining
+  // dimensions (on the right) are padded with ones. This doesn't
+  // affect the underlying data layout. This is particularly useful for
+  // dealing with a peculiarity of the MIOpen API, which is that broadcasting in MIOpen is
+  // done in two steps: first, the client code is expected to pad out
+  // (the dimensions) input tensors to be the same dimension as the
+  // target broadcast, and then second, MIOpen takes care of actually
+  // broadcasting size 1 dimensions.
+
   void set(const at::Tensor &t, size_t pad = 0);
+  void set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad = 0);
   void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0);
 
   void print();
 
 private:
-  void set(miopenDataType_t dataType, int dim, int* size, int* stride) {
-    MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride));
+  void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad, bool nhwc);
+
+  void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) {
+    std::vector<int> strides_copy(stride, stride + dim);
+    fixSizeOneDimStride<int>(dim, size, strides_copy.data(), nhwc);
+    MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data()));
   }
 };
 
@@ -99,12 +159,15 @@ class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor<
 
   void set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad = 0);
 
+  void print();
 private:
   void set(miopenDataType_t dataType, int dim, int* size, int* stride) {
     MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride));
   }
 };
 
+std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d);
+
 struct TORCH_HIP_CPP_API ConvolutionDescriptor
   : public Descriptor<
       miopenConvolutionDescriptor,
@@ -166,4 +229,4 @@ union Constant
   }
 };
 
-}} // namespace
+} // namespace
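
The nhwc flag changes the order in which fixSizeOneDimStride walks the dimensions, so size-1 dimensions get the stride MIOpen expects for an NHWC layout. Below is a standalone trace of the helper, copied from the hunk above so it can run outside of ATen; the example sizes and strides are illustrative.

// Standalone trace of fixSizeOneDimStride with an ambiguous size-1 dim.
#include <cstdint>
#include <iostream>
#include <vector>

template <typename T>
static inline void fixSizeOneDimStride(int dim, const T *size, T *stride, bool nhwc) {
  int64_t z = 1;
  int index = 0;
  std::vector<int> permutation(dim);

  if (nhwc) {
    permutation[index++] = 1;
  }
  for (int d = dim-1; d > 1; d--) {
    permutation[index++] = d;
  }
  if (!nhwc) {
    permutation[index++] = 1;
  }
  permutation[index++] = 0;
  for (int d : permutation) {
    if (size[d] == 1) {
      stride[d] = z;
    } else {
      z *= size[d];
    }
  }
}

int main() {
  // A (N=2, C=1, H=3, W=3) tensor. With C == 1 the stride of dim 1 is
  // arbitrary; a contiguous NCHW tensor would report 9 there.
  int size[4]   = {2, 1, 3, 3};
  int stride[4] = {9, 9, 3, 1};

  fixSizeOneDimStride<int>(4, size, stride, /*nhwc=*/true);

  // The size-1 channel dim is normalized to the canonical NHWC stride:
  // prints "9 1 3 1".
  for (int s : stride) std::cout << s << " ";
  std::cout << "\n";
  return 0;
}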

aten/src/ATen/native/ConvUtils.h

Lines changed: 15 additions & 7 deletions

@@ -353,19 +353,21 @@ TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable);
 TORCH_API bool _cudnn_get_conv_benchmark_empty_cache();
 
 
-inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
-
+inline at::MemoryFormat miopen_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) {
   // disable NHWC for float64 input.
   if (!at::detail::getCUDAHooks().compiledWithMIOpen() ||
       input.scalar_type() == at::kDouble ||
       weight.scalar_type() == at::kDouble) {
-    return false;
+    return at::MemoryFormat::Contiguous;
   }
 
   // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
-  // See #64427
-  static std::optional<bool> PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC");
-  static bool suggest_nhwc = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC;
+  // See https://github.com/pytorch/pytorch/issues/64427.
+  // A non-static variable is used here so the environment variable can be changed at runtime for testing.
+  // Enabled by default for ROCm >= 7.0.0 with MIOpen 3.5.
+  int miopen_version = detail::getCUDAHooks().compiledWithMIOpen() ? detail::getCUDAHooks().versionMIOpen() : 0;
+  bool is_miopen_3_5 = miopen_version >= 30500; // ROCm 7.0
+  bool suggest_nhwc = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC").value_or(is_miopen_3_5);
 
   auto input_memory_format = input.suggest_memory_format();
   auto weight_memory_format = weight.suggest_memory_format();
@@ -375,13 +377,19 @@ inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Ten
       (input_memory_format == at::MemoryFormat::ChannelsLast) ||
       (weight_memory_format == at::MemoryFormat::ChannelsLast)
   );
+  if (can_use_miopen_channels_last_2d) {
+    return at::MemoryFormat::ChannelsLast;
+  }
 
   bool can_use_miopen_channels_last_3d = suggest_nhwc && (weight_ndim == 5) && (
       (input_memory_format == at::MemoryFormat::ChannelsLast3d) ||
       (weight_memory_format == at::MemoryFormat::ChannelsLast3d)
   );
+  if (can_use_miopen_channels_last_3d) {
+    return at::MemoryFormat::ChannelsLast3d;
+  }
 
-  return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d;
+  return at::MemoryFormat::Contiguous;
 }
 
 inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
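
Callers that previously branched on the boolean miopen_conv_use_channels_last can now consume the suggested format directly. A minimal caller-side sketch, assuming a ROCm build where this internal header is reachable; prepare_for_miopen is a hypothetical helper for illustration, not part of this commit.

// Sketch: pick the layout the MIOpen path will use and make the input match it.
#include <ATen/ATen.h>
#include <ATen/native/ConvUtils.h>

at::Tensor prepare_for_miopen(const at::Tensor& input, const at::Tensor& weight) {
  // Returns ChannelsLast / ChannelsLast3d when NHWC is enabled (MIOpen >= 3.5
  // by default, or PYTORCH_MIOPEN_SUGGEST_NHWC=1) and either operand already
  // suggests it; otherwise Contiguous.
  auto fmt = at::native::miopen_conv_suggest_memory_format(input, weight);
  return input.contiguous(fmt);
}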

aten/src/ATen/native/Convolution.cpp

Lines changed: 5 additions & 4 deletions

@@ -459,6 +459,9 @@ struct ConvParams {
 
   // Use cudnn for FP16 depthwise convolutions
   bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const {
+    if (!detail::getCUDAHooks().compiledWithCuDNN()) {
+      return false;
+    }
     if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) {
       // always use cudnn_depthwise for channels_last format
       return true;
@@ -1419,10 +1422,8 @@ static inline at::MemoryFormat determine_backend_memory_format(
     case ConvBackend::Miopen:
     case ConvBackend::MiopenDepthwise:
     case ConvBackend::MiopenTranspose:
-      if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) {
-        TORCH_INTERNAL_ASSERT((k == 4 || k == 5),
-            "Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()");
-        backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
+      if (detail::getCUDAHooks().compiledWithMIOpen()) {
+        backend_memory_format = miopen_conv_suggest_memory_format(input, weight);
       }
       break;
     case ConvBackend::Mkldnn:
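
A quick way to observe the new dispatch end to end is to run a channels-last convolution and inspect the output layout. A sketch, assuming a ROCm build with MIOpen compiled in and NHWC enabled (MIOpen >= 3.5 or PYTORCH_MIOPEN_SUGGEST_NHWC=1); shapes are illustrative.

// Sketch: with channels-last operands, the MIOpen backend memory format is
// now suggested by miopen_conv_suggest_memory_format, so the output is
// expected to come back channels-last as well.
#include <ATen/ATen.h>
#include <iostream>

int main() {
  auto opts = at::device(at::kCUDA).dtype(at::kFloat);
  auto input  = at::randn({8, 16, 32, 32}, opts).contiguous(at::MemoryFormat::ChannelsLast);
  auto weight = at::randn({32, 16, 3, 3}, opts).contiguous(at::MemoryFormat::ChannelsLast);

  auto out = at::conv2d(input, weight, /*bias=*/{}, /*stride=*/1, /*padding=*/1);
  std::cout << "output is channels_last: "
            << out.is_contiguous(at::MemoryFormat::ChannelsLast) << "\n";
  return 0;
}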

0 commit comments