Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .ci/docker/requirements-ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -258,11 +258,6 @@ scipy==1.14.1 ; python_version > "3.9"
#Pinned versions:
#test that import:

tb-nightly==2.13.0a20230426
#Description: TensorBoard
#Pinned versions:
#test that import:

# needed by torchgen utils
typing-extensions>=4.10.0
#Description: type hints for python
Expand Down
6 changes: 5 additions & 1 deletion .github/scripts/build_triton_wheel.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,12 @@ def build_triton(

triton_repo = "https://github.com/openai/triton"
if device == "rocm":
triton_pkg_name = "pytorch-triton-rocm"
triton_repo = "https://github.com/ROCm/triton"
rocm_version = get_rocm_version() # e.g., "7.0.1"
if tuple(map(int, rocm_version.split("."))) > (7, 0, 0):
triton_pkg_name = "triton"
else:
triton_pkg_name = "pytorch-triton-rocm"
elif device == "xpu":
triton_pkg_name = "pytorch-triton-xpu"
triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton"
Expand Down
4 changes: 2 additions & 2 deletions aten/src/ATen/Context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ at::BlasBackend Context::blasPreferredBackend() {
static const std::vector<std::string> archs = {
"gfx90a", "gfx942",
#if ROCM_VERSION >= 60400
"gfx1200", "gfx1201",
"gfx1150", "gfx1151", "gfx1200", "gfx1201",
#endif
#if ROCM_VERSION >= 60500
"gfx950"
Expand All @@ -362,7 +362,7 @@ at::BlasBackend Context::blasPreferredBackend() {
static const std::vector<std::string> archs = {
"gfx90a", "gfx942",
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101", "gfx1200", "gfx1201",
"gfx1100", "gfx1101", "gfx1150", "gfx1151", "gfx1200", "gfx1201",
#endif
#if ROCM_VERSION >= 60500
"gfx950"
Expand Down
12 changes: 6 additions & 6 deletions aten/src/ATen/miopen/Descriptors.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ struct DescriptorDeleter {
// function.
template <typename T, miopenStatus_t (*ctor)(T**), miopenStatus_t (*dtor)(T*)>
// NOLINTNEXTLINE(bugprone-exception-escape)
class TORCH_CUDA_CPP_API Descriptor {
class TORCH_HIP_CPP_API Descriptor {
public:
// Use desc() to access the underlying descriptor pointer in
// a read-only fashion. Most client code should use this.
Expand All @@ -65,7 +65,7 @@ class TORCH_CUDA_CPP_API Descriptor {
std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
};

class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor<
class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor<
miopenTensorDescriptor,
&miopenCreateTensorDescriptor,
&miopenDestroyTensorDescriptor> {
Expand All @@ -88,7 +88,7 @@ class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor<

std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d);

class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor<
class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor<
miopenTensorDescriptor,
&miopenCreateTensorDescriptor,
&miopenDestroyTensorDescriptor> {
Expand All @@ -105,7 +105,7 @@ class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor<
}
};

struct TORCH_CUDA_CPP_API ConvolutionDescriptor
struct TORCH_HIP_CPP_API ConvolutionDescriptor
: public Descriptor<
miopenConvolutionDescriptor,
&miopenCreateConvolutionDescriptor,
Expand All @@ -121,7 +121,7 @@ struct TORCH_CUDA_CPP_API ConvolutionDescriptor
};

// NOLINTNEXTLINE(bugprone-exception-escape)
struct TORCH_CUDA_CPP_API DropoutDescriptor
struct TORCH_HIP_CPP_API DropoutDescriptor
: public Descriptor<
miopenDropoutDescriptor,
&miopenCreateDropoutDescriptor,
Expand All @@ -137,7 +137,7 @@ struct TORCH_CUDA_CPP_API DropoutDescriptor
}
};

struct TORCH_CUDA_CPP_API RNNDescriptor
struct TORCH_HIP_CPP_API RNNDescriptor
: public Descriptor<miopenRNNDescriptor,
&miopenCreateRNNDescriptor,
&miopenDestroyRNNDescriptor>
Expand Down
2 changes: 1 addition & 1 deletion aten/src/ATen/miopen/Handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@

namespace at::native {

TORCH_CUDA_CPP_API miopenHandle_t getMiopenHandle();
TORCH_HIP_CPP_API miopenHandle_t getMiopenHandle();
} // namespace at::native
2 changes: 1 addition & 1 deletion aten/src/ATen/miopen/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

namespace at::native {

TORCH_CUDA_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor);
TORCH_HIP_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor);

int64_t miopen_version();

Expand Down
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cuda/Blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ static bool isSupportedHipLtROCmArch(int index) {
static const std::vector<std::string> archs = {
"gfx90a", "gfx942",
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101", "gfx1200", "gfx1201",
"gfx1100", "gfx1101", "gfx1150", "gfx1151", "gfx1200", "gfx1201",
#endif
#if ROCM_VERSION >= 60500
"gfx950"
Expand Down
27 changes: 27 additions & 0 deletions aten/src/ATen/native/cuda/Reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -797,14 +797,23 @@ struct ReduceOp {
if (should_store) {
index_t offset = config.staging_memory_offset(blockIdx.y);
reduce_buffer[offset] = value;
#ifdef USE_ROCM
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); // make sure writes are globally visible
#endif
}

#ifndef USE_ROCM
__threadfence(); // make sure writes are globally visible
#endif
__syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
bool is_last_block_done = mark_block_finished();

if (is_last_block_done) {
#ifdef USE_ROCM
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); // complete the acquire pattern after release
#else
__threadfence(); // complete the acquire pattern after atomic
#endif
for (auto &v : value) {
v = ident;
}
Expand All @@ -822,6 +831,23 @@ struct ReduceOp {
} else {
index_t input_offset = threadIdx.y;
index_t step = blockDim.y;
#ifdef USE_ROCM // Prefetch loads to better hide their latency
#define PRFCH 4
for (; input_offset < config.ctas_per_output; input_offset += step*PRFCH) {
arg_vec_t next[PRFCH];
#pragma unroll
for (int u = 0; (u < PRFCH) && (input_offset + u*step < config.ctas_per_output); u++) {
index_t idx = config.staging_memory_offset(input_offset + u*step);
next[u] = reduce_buffer[idx];
}
for (int u = 0; (u < PRFCH) && (input_offset + u*step < config.ctas_per_output); u++) {
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = ops.combine(value[i], next[u][i]);
}
}
}
#else
for (; input_offset < config.ctas_per_output; input_offset += step) {
index_t idx = config.staging_memory_offset(input_offset);
arg_vec_t next = reduce_buffer[idx];
Expand All @@ -830,6 +856,7 @@ struct ReduceOp {
value[i] = ops.combine(value[i], next[i]);
}
}
#endif
}
value = block_y_reduce<output_vec_size>(value, shared_memory);
if (config.should_block_x_reduce()) {
Expand Down
13 changes: 5 additions & 8 deletions cmake/public/LoadHIP.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -93,19 +93,16 @@ if(HIP_FOUND)
# hip (lower-case) package. Both are probed above and will be in
# ROCM_INCLUDE_DIRS if available.
find_file(ROCM_VERSION_HEADER_PATH
NAMES rocm-core/rocm_version.h
NAMES rocm-core/rocm_version.h hip/hip_version.h
NO_DEFAULT_PATH
PATHS ${ROCM_INCLUDE_DIRS}
)
set(ROCM_LIB_NAME "ROCM")
if(NOT ROCM_VERSION_HEADER_PATH)
find_file(ROCM_VERSION_HEADER_PATH
NAMES hip/hip_version.h
NO_DEFAULT_PATH
PATHS ${ROCM_INCLUDE_DIRS}
)
if(ROCM_VERSION_HEADER_PATH MATCHES "rocm-core/rocm_version.h$")
set(ROCM_LIB_NAME "ROCM")
else()
set(ROCM_LIB_NAME "HIP")
endif()

if(NOT ROCM_VERSION_HEADER_PATH)
message(FATAL_ERROR "Could not find hip/hip_version.h or rocm-core/rocm_version.h in ${ROCM_INCLUDE_DIRS}")
endif()
Expand Down
8 changes: 8 additions & 0 deletions test/distributed/test_c10d_nccl.py
Original file line number Diff line number Diff line change
Expand Up @@ -639,6 +639,14 @@ def _helper_test_extra_cuda_context_by_memory(self):
"""
device = torch.device(f"cuda:{self.rank:d}")
x = torch.empty((1,), device=device)

# We need this barrier to ensure that all nodes have completed init_process_group
# If rank=0 gets a mem snapshot before other nodes have finished init_process_group,
# then we artificially see a bump in memory usage. As per the following comment,
# we are going to be moving away from this function:
# https://github.com/pytorch/pytorch/pull/154174#discussion_r2105065931
c10d.barrier()

# Rank 0 takes a snapshot before collective -- this snapshot should have
# included rank 0's own context.
if self.rank == 0:
Expand Down
6 changes: 5 additions & 1 deletion test/inductor/test_flex_decoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
)
from torch.testing import FileCheck
from torch.testing._internal import common_utils
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_BF16
from torch.testing._internal.common_cuda import (
PLATFORM_SUPPORTS_BF16,
PLATFORM_SUPPORTS_FLASH_ATTENTION,
)
from torch.testing._internal.common_device_type import (
flex_attention_supported_platform as supported_platform,
instantiate_device_type_tests,
Expand Down Expand Up @@ -1582,6 +1585,7 @@ def mask_mod(b, h, q, kv):
self.assertEqual(out[:, :, M:, :].sum(), 0)

@supported_platform
@unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
def test_windowed_no_mask_vs_sdpa(self, device):
score_mod = _generate_windowed(1000)
attention = functools.partial(flex_attention, score_mod=score_mod)
Expand Down
9 changes: 8 additions & 1 deletion test/inductor/test_max_autotune.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,15 @@
)
from torch._inductor.template_heuristics import CUDAConfigHeuristic, GemmConfig
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8
from torch.testing._internal.common_device_type import largeTensorTest
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
IS_WINDOWS,
parametrize,
TEST_WITH_ROCM,
MI300_ARCH,
runOnRocmArch,
skipIfXpu,
)
from torch.testing._internal.logging_utils import multiple_logs_to_string
from torch.utils._triton import has_triton_tma_device
Expand All @@ -54,7 +58,6 @@
from torch._inductor.virtualized import V
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing import FileCheck
from torch.testing._internal.common_utils import MI300_ARCH, runOnRocmArch, skipIfXpu
from torch.testing._internal.inductor_utils import (
get_func_call,
get_kernel_launch,
Expand Down Expand Up @@ -804,6 +807,8 @@ def test_conv_backend(self):

self.assertIn("NoValidChoicesError", str(context.exception))

# Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
@largeTensorTest("30 GB", device=GPU_TYPE)
def test_non_contiguous_input_mm(self):
"""
Make sure the triton template can work with non-contiguous inputs without crash.
Expand Down Expand Up @@ -856,6 +861,8 @@ def f(x, y):
# TODO: fix accuracy failure of the triton template on XPU.
# and enable this test case.
@skipIfXpu
# Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
@largeTensorTest("30 GB", device=GPU_TYPE)
def test_non_contiguous_input_mm_plus_mm(self):
x1 = rand_strided((50257, 32768), (1, 50304), device=GPU_TYPE)
y1 = rand_strided((32768, 768), (768, 1), device=GPU_TYPE)
Expand Down
71 changes: 70 additions & 1 deletion torch/csrc/jit/codegen/fuser/cuda/resource_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ typedef __half half;
)";
#endif

#if defined(USE_ROCM)
#if defined(USE_ROCM) && ROCM_VERSION < 70000
constexpr auto bfloat16_support_literal =
R"(
#ifndef __align__
Expand Down Expand Up @@ -317,6 +317,75 @@ __device__ __nv_bfloat16 __float2bfloat16(const float a) {
return val;
}

__device__ float __bfloat162float(const __nv_bfloat16 a) {
union
{
uint32_t int32;
float fp32;
} u = {uint32_t(a.__x) << 16};
return u.fp32;
}
#endif /* defined(__cplusplus) */
)";
#elif defined(USE_ROCM) && ROCM_VERSION >= 70000
constexpr auto bfloat16_support_literal =
R"(
#ifndef __align__
#define __align__(x) __attribute__((aligned(x)))
#endif

typedef unsigned int uint32_t;

typedef struct __align__(2) {
unsigned short x;
}
__nv_bfloat16_raw;

#if defined(__cplusplus)
struct __align__(2) __nv_bfloat16 {
__host__ __device__ __nv_bfloat16() {}

__host__ __device__ __nv_bfloat16& operator=(const __nv_bfloat16_raw& hr) {
__x = hr.x;
return *this;
}

unsigned short __x;
};

__device__ unsigned short __internal_float2bfloat16(
const float f,
unsigned int& sign,
unsigned int& remainder) {
unsigned int x;

x = __float_as_uint(f);

if ((x & 0x7fffffffU) > 0x7f800000U) {
sign = 0U;
remainder = 0U;
return static_cast<unsigned short>(0x7fffU);
}
sign = x >> 31;
remainder = x << 16;
return static_cast<unsigned short>(x >> 16);
}

/* Definitions of intrinsics */
__device__ __nv_bfloat16 __float2bfloat16(const float a) {
__nv_bfloat16 val;
__nv_bfloat16_raw r;
unsigned int sign;
unsigned int remainder;
r.x = __internal_float2bfloat16(a, sign, remainder);
if ((remainder > 0x80000000U) ||
((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
r.x++;
}
val = r;
return val;
}

__device__ float __bfloat162float(const __nv_bfloat16 a) {
union
{
Expand Down