Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions aten/src/ATen/cudnn/Descriptors.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ inline int dataSize(cudnnDataType_t dataType)
}
}

// NOTE [ cudnn fixSizeOneDimStride ]
// The stride for a size-1 dimensions is not uniquely determined; in
// fact, it can be anything you want, because the fact that the
// tensor is size 1 at this dimension means that you will never actually
Expand Down
63 changes: 35 additions & 28 deletions aten/src/ATen/miopen/Descriptors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,31 +19,37 @@ inline miopenDataType_t getDataType(const at::Tensor& t) {
} else {
TORCH_CHECK(
false,
"TensorDescriptor only supports float, half and bfloat16 tensors");
"TensorDescriptor does not support ", scalar_type);
}
}

} // anonymous namespace

constexpr size_t MIOPEN_DIM_MAX = 5;

void TensorDescriptor::set(const at::Tensor &t, size_t pad) {
set(getDataType(t), t.sizes(), t.strides(), pad);
// Explicit-format overload: the caller dictates the memory format instead of
// having it inferred from the tensor, then we forward to the size/stride
// overload with the channels-last flag derived from that format.
void TensorDescriptor::set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad) {
  const bool is_channels_last =
      memory_format == at::MemoryFormat::ChannelsLast ||
      memory_format == at::MemoryFormat::ChannelsLast3d;
  set(getDataType(t), t.sizes(), t.strides(), pad, is_channels_last);
}

constexpr size_t MIOPEN_DIM_MAX = 5;
// Convenience overload: infer the memory format from the tensor itself and
// forward to the size/stride overload, flagging NHWC/NDHWC layouts.
void TensorDescriptor::set(const at::Tensor &t, size_t pad) {
  const auto suggested = t.suggest_memory_format();
  const bool is_channels_last =
      suggested == at::MemoryFormat::ChannelsLast ||
      suggested == at::MemoryFormat::ChannelsLast3d;
  set(getDataType(t), t.sizes(), t.strides(), pad, is_channels_last);
}

// Convenience overload: detect a channels-last (2d or 3d) stride pattern
// directly from the raw sizes/strides and forward to the nhwc-aware overload.
void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad) {
  const bool nhwc = is_channels_last_strides_2d(t_sizes, t_strides) ||
                    is_channels_last_strides_3d(t_sizes, t_strides);
  set(datatype, t_sizes, t_strides, pad, nhwc);
}

void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad, bool nhwc) {
size_t dim = t_sizes.size();
if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX)
#define _STR(X) #X
#define STR(X) _STR(X)
TORCH_CHECK(
false,
"MIOpen supports only up to ",
STR(MIOPEN_DIM_MAX),
" dimensions");
#undef _STR
#undef STR
TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions");
int size[MIOPEN_DIM_MAX];
int stride[MIOPEN_DIM_MAX];
for (const auto i : c10::irange(dim)) {
Expand All @@ -54,7 +60,7 @@ void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntAr
size[i] = 1;
stride[i] = 1;
}
set(datatype, static_cast<int>(std::max(dim, pad)), size, stride);
set(datatype, static_cast<int>(std::max(dim, pad)), size, stride, nhwc);
}

std::string miopenTypeToString(miopenDataType_t dtype) {
Expand All @@ -74,10 +80,11 @@ std::string miopenTypeToString(miopenDataType_t dtype) {

std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
int nbDims = 4;
int nbDims = 0;
int dimA[MIOPEN_DIM_MAX];
int strideA[MIOPEN_DIM_MAX];
miopenDataType_t dtype;
miopenGetTensorDescriptorSize(d.desc(), &nbDims);
miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA);
out << " type = " << miopenTypeToString(dtype) << "\n";
out << " nbDims = " << nbDims << "\n";
Expand All @@ -99,19 +106,17 @@ void TensorDescriptor::print() { std::cout << *this; }

void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad) {
auto dim = t.ndimension();
if (dim > static_cast<int64_t>(MIOPEN_DIM_MAX) || pad > static_cast<int64_t>(MIOPEN_DIM_MAX)) {
#define _STR(X) #X
#define STR(X) _STR(X)
TORCH_CHECK(
false,
"MIOpen supports only up to ",
STR(MIOPEN_DIM_MAX),
" dimensions");
#undef _STR
#undef STR
}
if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX)
TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions");
// NB: It is possible for this test to be insufficient, because the
// Tensor passed in to set the filter descriptor may not be the actual
// Tensor whose data pointer is passed to cuDNN. Nevertheless,
// that is the common case, so we can catch most client errors with this test.
TORCH_CHECK(t.is_contiguous(memory_format),
"MIOpen filters (a.k.a. weights) must be contiguous");
"MIOpen filters (a.k.a. weights) must be contiguous in desired memory_format\n",
"Weight sizes: ", t.sizes(), "\n",
"Weight strides: ", t.strides(), "\n",
"cuDNN suggested memory_format: ", memory_format);

int size[MIOPEN_DIM_MAX];
int stride[MIOPEN_DIM_MAX];
Expand All @@ -131,7 +136,9 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
}

dim = std::max<int64_t>(dim, pad);
set(getDataType(t), (int) dim, size, stride);
set(getDataType(t), static_cast<int>(dim), size, stride,
memory_format == at::MemoryFormat::ChannelsLast ||
memory_format == at::MemoryFormat::ChannelsLast3d);
}

}}
46 changes: 41 additions & 5 deletions aten/src/ATen/miopen/Descriptors.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

namespace at { namespace native {

std::string miopenTypeToString(miopenDataType_t dtype);

inline int dataSize(miopenDataType_t dataType)
{
switch (dataType) {
Expand All @@ -19,6 +21,32 @@ inline int dataSize(miopenDataType_t dataType)
}
}

// See NOTE [ cudnn fixSizeOneDimStride ] in aten/src/ATen/cudnn/Descriptors.h
//
// Rewrites the stride of every size-1 dimension to the value a contiguous
// tensor (in NCHW or, when `nhwc` is set, NHWC dimension order) would carry
// there, so that logically-equal tensors produce identical descriptors.
// `size` and `stride` are arrays of length `dim`; `stride` is edited in place.
template <typename T>
static inline void fixSizeOneDimStride(int dim, const T *size, T *stride, bool nhwc) {
  // Guard: for dim < 2 the permutation scheme below would write past the end
  // of `permutation` and reference dimension 1, which does not exist. The
  // only stride to normalize is dimension 0 itself (innermost stride is 1).
  if (dim < 2) {
    if (dim == 1 && size[0] == 1) {
      stride[0] = static_cast<T>(1);
    }
    return;
  }
  int64_t z = 1;
  int index = 0;
  // Visit dimensions from innermost to outermost in the chosen layout:
  // NHWC: C, spatial dims (last..2), N;  NCHW: spatial dims (last..2), C, N.
  std::vector<int> permutation(dim);

  if (nhwc) {
    permutation[index++] = 1;
  }
  for (int d = dim-1; d > 1; d--) {
    permutation[index++] = d;
  }
  if (!nhwc) {
    permutation[index++] = 1;
  }
  permutation[index++] = 0;
  // `z` accumulates the product of the sizes visited so far, i.e. the
  // contiguous stride a size-1 dimension at this position would have.
  for (int d : permutation) {
    if (size[d] == 1) {
      stride[d] = static_cast<T>(z);
    } else {
      z *= size[d];
    }
  }
}

template <typename T, miopenStatus_t (*dtor)(T*)>
struct DescriptorDeleter {
void operator()(T* x) {
Expand Down Expand Up @@ -75,14 +103,20 @@ class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor<
set(t, pad);
}

// See Note [CuDNN broadcast padding]
void set(const at::Tensor &t, size_t pad = 0);
void set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad = 0);
void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0);

void print();

private:
void set(miopenDataType_t dataType, int dim, int* size, int* stride) {
MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride));
void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad, bool nhwc);

// Core setter: canonicalizes size-1 strides (see fixSizeOneDimStride) on a
// scratch copy — leaving the caller's stride array untouched — then hands the
// normalized geometry to MIOpen.
void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) {
  // Copy first: fixSizeOneDimStride writes strides in place.
  std::vector<int> strides_copy(stride, stride + dim);
  fixSizeOneDimStride<int>(dim, size, strides_copy.data(), nhwc);
  MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data()));
}
};

Expand All @@ -100,8 +134,10 @@ class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor<
void set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad = 0);

private:
void set(miopenDataType_t dataType, int dim, int* size, int* stride) {
MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride));
// Core setter for filter (weight) descriptors: normalizes size-1 strides on a
// local copy so the caller's array is preserved, then registers the
// descriptor with MIOpen.
void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) {
  // Copy first: fixSizeOneDimStride mutates the stride array it is given.
  std::vector<int> strides_copy(stride, stride + dim);
  fixSizeOneDimStride<int>(dim, size, strides_copy.data(), nhwc);
  MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data()));
}
};

Expand Down Expand Up @@ -166,4 +202,4 @@ union Constant
}
};

}} // namespace
}} // namespace
27 changes: 20 additions & 7 deletions aten/src/ATen/native/ConvUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -353,19 +353,21 @@ TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable);
TORCH_API bool _cudnn_get_conv_benchmark_empty_cache();


inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {

inline at::MemoryFormat miopen_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) {
// disable NHWC for float64 input.
if (!at::detail::getCUDAHooks().compiledWithMIOpen() ||
input.scalar_type() == at::kDouble ||
weight.scalar_type() == at::kDouble) {
return false;
return at::MemoryFormat::Contiguous;
}

// TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
// See #64427
static std::optional<bool> PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC");
static bool suggest_nhwc = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC;
// See https://github.com/pytorch/pytorch/issues/64427.
// non static variable is used to be able to change environment variable in runtime for testing
// enabled by default for ROCm >= 7.0.0 with miopen 3.5
int miopen_version = detail::getCUDAHooks().compiledWithMIOpen() ? detail::getCUDAHooks().versionMIOpen() : 0;
bool is_miopen_3_5 = miopen_version >= 30500; // ROCm 7.0
bool suggest_nhwc = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC").value_or(is_miopen_3_5);

auto input_memory_format = input.suggest_memory_format();
auto weight_memory_format = weight.suggest_memory_format();
Expand All @@ -375,13 +377,24 @@ inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Ten
(input_memory_format == at::MemoryFormat::ChannelsLast) ||
(weight_memory_format == at::MemoryFormat::ChannelsLast)
);
if (can_use_miopen_channels_last_2d) {
return at::MemoryFormat::ChannelsLast;
}

bool can_use_miopen_channels_last_3d = suggest_nhwc && (weight_ndim == 5) && (
(input_memory_format == at::MemoryFormat::ChannelsLast3d) ||
(weight_memory_format == at::MemoryFormat::ChannelsLast3d)
);
if (can_use_miopen_channels_last_3d) {
return at::MemoryFormat::ChannelsLast3d;
}

return at::MemoryFormat::Contiguous;
}

return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d;
// deprecated, but to remove would be BC-breaking
inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
  // True whenever the suggested format is any channels-last variant.
  const auto fmt = miopen_conv_suggest_memory_format(input, weight);
  return fmt != at::MemoryFormat::Contiguous;
}

inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
Expand Down
9 changes: 5 additions & 4 deletions aten/src/ATen/native/Convolution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,9 @@ struct ConvParams {

// Use cudnn for FP16 depthwise convolutions
bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const {
if (!detail::getCUDAHooks().compiledWithCuDNN()) {
return false;
}
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) {
// always use cudnn_depthwise for channels_last format
return true;
Expand Down Expand Up @@ -1418,10 +1421,8 @@ static inline at::MemoryFormat determine_backend_memory_format(
case ConvBackend::Miopen:
case ConvBackend::MiopenDepthwise:
case ConvBackend::MiopenTranspose:
if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) {
TORCH_INTERNAL_ASSERT((k == 4 || k == 5),
"Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()");
backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
if (detail::getCUDAHooks().compiledWithMIOpen()) {
backend_memory_format = miopen_conv_suggest_memory_format(input, weight);
}
break;
case ConvBackend::Mkldnn:
Expand Down
Loading