diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 2152de9709dc6..b19632535b3e1 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -633,7 +633,7 @@ specified by the ``i32 %d0 ... i32 %d4`` arguments. For more information, refer
 PTX ISA
 ``_.
 
-'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``'
+'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[3-5]d``'
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 .. code-block:: llvm
 
@@ -648,7 +648,7 @@ Syntax:
 Overview:
 """""""""
 
-The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``' intrinsics
+The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[3-5]d``' intrinsics
 correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
 of PTX instructions. These instructions initiate an asynchronous prefetch
 of tensor data from global memory to the L2 cache. In im2col mode, some
@@ -663,6 +663,76 @@ the same functionality as described in the ``tile`` mode intrinsics above.
 For more information, refer PTX ISA
 ``_.
 
+'``llvm.nvvm.cp.async.bulk.tensor.reduce.[red_op].tile.[1-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.2d(..., i32 %d0, i32 %d1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.[1-5]d``' intrinsics
+correspond to the ``cp.reduce.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
+These instructions initiate an asynchronous reduction operation of tensor data
+in global memory with the tensor data in shared{::cta} memory, using ``tile`` mode.
+The dimension of the tensor data ranges from 1d to 5d with the coordinates
+specified by the ``i32 %d0 ... i32 %d4`` arguments. The supported reduction
+operations are {add, min, max, inc, dec, and, or, xor} as described in the
+``tile.1d`` intrinsics.
+
+* The last argument to these intrinsics is a boolean flag
+  indicating support for cache_hint. This flag argument must
+  be a compile-time constant.
+  When set, it indicates a valid
+  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+For more information, refer PTX ISA
+``_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.reduce.[red_op].im2col.[3-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.im2col.3d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.im2col.[3-5]d``' intrinsics
+correspond to the ``cp.reduce.async.bulk.tensor.[3-5]d.*`` set of PTX instructions.
+These instructions initiate an asynchronous reduction operation of tensor data
+in global memory with the tensor data in shared{::cta} memory, using ``im2col`` mode.
+In this mode, the tensor has to be at least three-dimensional. The supported reduction
+operations are the same as the ones in the tile mode. The last argument to
+these intrinsics is a boolean flag, with the same functionality as described in the
+``tile`` mode intrinsics above.
+
+For more information, refer PTX ISA
+``_.
+
 Other Intrinsics
 ----------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 115fcee0b04f2..f91ee9fc619e5 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -635,6 +635,25 @@ class CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
                ImmArg<ArgIndex<FlagsStartIdx>>];
 }
 
+class CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, string mode, string op> {
+  string Suffix = op # "_" # mode # "_" # dim # "d";
+  string Name = "int_nvvm_cp_async_bulk_tensor_reduce_" # Suffix;
+
+  list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
+  list<LLVMType> ArgsTy = !listconcat(
+                          [llvm_shared_ptr_ty,  // src_smem_ptr
+                           llvm_ptr_ty],        // tensormap_ptr
+                           TensorDimsTy,        // actual tensor dims
+                          [llvm_i64_ty,         // cache_hint
+                           llvm_i1_ty]          // Flag for cache_hint
+                          );
+  int FlagsStartIdx = !add(dim, 3);
+  list<IntrinsicProperty> IntrProp = [IntrConvergent,
+               ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
+               NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+               ImmArg<ArgIndex<FlagsStartIdx>>];
+}
+
 let TargetPrefix = "nvvm" in {
   def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
       DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4929,4 +4948,14 @@ foreach dim = [1, 2, 3, 4, 5] in {
   }
 }
 
+// Intrinsics for TMA Copy with reduction
+foreach dim = [1, 2, 3, 4, 5] in {
+  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
+    foreach red_op = ["add", "min", "max", "inc", "dec", "and", "or", "xor"] in {
+      foreach reduce = [CP_ASYNC_BULK_TENSOR_REDUCE_INTR<dim, mode, red_op>] in
+        def reduce.Name : DefaultAttrsIntrinsic<[], reduce.ArgsTy, reduce.IntrProp>;
+    }
+  }
+}
+
 } // let TargetPrefix = "nvvm"
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicFlags.h b/llvm/include/llvm/IR/NVVMIntrinsicFlags.h
new file mode 100644
index 0000000000000..43dde42bbbd62
--- /dev/null
+++ b/llvm/include/llvm/IR/NVVMIntrinsicFlags.h
@@ -0,0 +1,37 @@
+//===--- NVVMIntrinsicFlags.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file contains the definitions of the enumerations and flags +/// associated with NVVM Intrinsics. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_NVVMINTRINSICFLAGS_H +#define LLVM_IR_NVVMINTRINSICFLAGS_H + +namespace llvm { +namespace nvvm { + +// Reduction Ops supported with TMA Copy from Shared +// to Global Memory for the "cp.reduce.async.bulk.tensor.*" +// family of PTX instructions. +enum class TMAReductionOp : uint8_t { + ADD = 0, + MIN = 1, + MAX = 2, + INC = 3, + DEC = 4, + AND = 5, + OR = 6, + XOR = 7, +}; + +} // namespace nvvm +} // namespace llvm +#endif // LLVM_IR_NVVMINTRINSICFLAGS_H diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 7af3f76249d61..9d27ca689a215 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -14,6 +14,7 @@ #include "NVPTX.h" #include "NVPTXUtilities.h" #include "llvm/ADT/StringRef.h" +#include "llvm/IR/NVVMIntrinsicFlags.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -416,3 +417,40 @@ void NVPTXInstPrinter::printPrmtMode(const MCInst *MI, int OpNum, return; } } + +void NVPTXInstPrinter::printTmaReductionMode(const MCInst *MI, int OpNum, + raw_ostream &O, + const char *Modifier) { + const MCOperand &MO = MI->getOperand(OpNum); + using RedTy = llvm::nvvm::TMAReductionOp; + + switch (static_cast(MO.getImm())) { + case RedTy::ADD: + O << ".add"; + return; + case RedTy::MIN: + O << ".min"; + return; + case RedTy::MAX: + O << ".max"; + return; + case RedTy::INC: + O << ".inc"; + return; + case RedTy::DEC: + O << ".dec"; + return; + case RedTy::AND: + O << ".and"; + return; + case RedTy::OR: + O << ".or"; + return; + case RedTy::XOR: + O << ".xor"; + return; + default: + llvm_unreachable( + "Invalid Reduction Op in printCpAsyncBulkTensorReductionMode"); + } +} diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h index 2ce40bd6e8b97..2b19386ef17fe 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h @@ -54,6 +54,8 @@ class NVPTXInstPrinter : public MCInstPrinter { raw_ostream &O, const char *Modifier = nullptr); void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O, const char *Modifier = nullptr); + void printTmaReductionMode(const MCInst *MI, int OpNum, raw_ostream &O, + const char *Modifier = nullptr); }; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 3b03ec67dc8ce..e1fb2d7fcee03 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/IR/NVVMIntrinsicFlags.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -4152,40 +4153,51 @@ NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const { bool NVPTXScopes::empty() const { return Scopes.size() == 0; } -#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, suffix) \ - (IsShared32 \ +#define 
CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix) \ + (is_s32 \ ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \ : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix) -#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode) \ - (IsCacheHint ? (CP_ASYNC_BULK_TENSOR_OPCODE(S2G, dim, mode, _CH)) \ - : (CP_ASYNC_BULK_TENSOR_OPCODE(S2G, dim, mode, ))) +#define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32) \ + (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \ + : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, ))) -#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode) \ +#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, \ + is_s32) \ + (is_reduce \ + ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \ + : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \ + is_s32))) + +#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \ [&]() -> auto { \ - if (IsMultiCast && IsCacheHint) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, _MC_CH); \ - if (IsCacheHint) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, _CH); \ - if (IsMultiCast) \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, _MC); \ - return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, ); \ + if (is_mc && is_ch) \ + return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \ + if (is_ch) \ + return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \ + if (is_mc) \ + return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \ + return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \ }() -#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode) \ - (IsCacheHint ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \ - : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode) +#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch) \ + (is_ch ? 
NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \ + : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode) static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32, - bool IsCacheHint, bool IsIm2Col) { + bool IsCacheHint, bool IsIm2Col, + bool IsReduce = false) { if (IsIm2Col) { switch (Dim) { case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL, IsReduce, + IsCacheHint, IsShared32); case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL, IsReduce, + IsCacheHint, IsShared32); case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL, IsReduce, + IsCacheHint, IsShared32); default: llvm_unreachable("Invalid Dimension in im2col mode for " "GetCpAsyncBulkTensorS2GOpcode."); @@ -4193,15 +4205,20 @@ static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32, } else { switch (Dim) { case 1: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE, IsReduce, + IsCacheHint, IsShared32); case 2: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE, IsReduce, + IsCacheHint, IsShared32); case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE, IsReduce, + IsCacheHint, IsShared32); case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE, IsReduce, + IsCacheHint, IsShared32); case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE, IsReduce, + IsCacheHint, IsShared32); default: llvm_unreachable( "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode."); @@ -4215,11 +4232,14 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32, if (IsIm2Col) { switch (Dim) { case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast, + IsCacheHint, IsShared32); case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast, + IsCacheHint, IsShared32); case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast, + IsCacheHint, IsShared32); default: llvm_unreachable("Invalid Dimension in im2col mode for " "GetCpAsyncBulkTensorG2SOpcode."); @@ -4227,15 +4247,20 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32, } else { switch (Dim) { case 1: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast, + IsCacheHint, IsShared32); case 2: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast, + IsCacheHint, IsShared32); case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast, + IsCacheHint, IsShared32); case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast, + IsCacheHint, IsShared32); case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast, + IsCacheHint, 
IsShared32); default: llvm_unreachable( "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode."); @@ -4248,11 +4273,11 @@ static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint, if (IsIm2Col) { switch (Dim) { case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL, IsCacheHint); case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL, IsCacheHint); case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL, IsCacheHint); default: llvm_unreachable("Invalid Dimension in im2col mode for " "GetCpAsyncBulkTensorPrefetchOpcode."); @@ -4260,15 +4285,15 @@ static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint, } else { switch (Dim) { case 1: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE, IsCacheHint); case 2: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, IsCacheHint); case 3: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE, IsCacheHint); case 4: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE, IsCacheHint); case 5: - return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE); + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE, IsCacheHint); default: llvm_unreachable("Invalid Dimension in tile mode for " "GetCpAsyncBulkTensorPrefetchOpcode."); @@ -4377,8 +4402,34 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); } +void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N, + unsigned RedOp, + bool IsIm2Col) { + // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: + // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag + // NumOperands = {Chain, IID} + {Actual intrinsic args} + // = {2} + {4 + dims} + size_t NumOps = N->getNumOperands(); + size_t NumDims = NumOps - 6; + bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; + size_t NumArgs = NumDims + (IsCacheHint ? 
3 : 2); // src, dst, cache_hint + + SDLoc DL(N); + SmallVector Ops(N->ops().slice(2, NumArgs)); + Ops.push_back(getI32Imm(RedOp, DL)); // Reduction Op + Ops.push_back(N->getOperand(0)); // Chain operand + + bool IsShared32 = + CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32; + unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode( + NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true); + ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); +} + bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { unsigned IID = N->getConstantOperandVal(1); + using TMARedTy = llvm::nvvm::TMAReductionOp; + auto CastTy = [](TMARedTy Op) { return static_cast(Op); }; switch (IID) { default: return false; @@ -4418,5 +4469,109 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d: SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true); return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD)); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD), + /*IsIm2Col=*/true); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN)); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN), + /*IsIm2Col=*/true); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX)); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX), + /*IsIm2Col=*/true); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC)); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d: + case 
Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC), + /*IsIm2Col=*/true); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC)); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC), + /*IsIm2Col=*/true); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND)); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND), + /*IsIm2Col=*/true); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR)); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR), + /*IsIm2Col=*/true); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR)); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d: + SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR), + /*IsIm2Col=*/true); + return true; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index d6c80a31b7463..8cc270a682900 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -95,6 +95,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool 
IsIm2Col = false); + void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp, + bool IsIm2Col = false); + inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 5878940812f62..6839fe2da1574 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -564,17 +564,19 @@ foreach dim = [1, 2, 3, 4, 5] in { } // From Shared to Global memory (S2G) -class S2G_STRINGS { - string prefix = "cp.async.bulk.tensor"; +class S2G_STRINGS { string dir = "global.shared::cta"; string completion = "bulk_group"; - string inst_name = prefix + string inst_name = !if(is_reduce, "cp.reduce", "cp") + # ".async.bulk.tensor" # "." # dim # "d" # "." # dir # "." # mode # "." # completion # !if(ch, ".L2::cache_hint", ""); - string intr_name = "CP_ASYNC_BULK_TENSOR_S2G_" + string intr_name = "CP_ASYNC_BULK_TENSOR_" + # !if(is_reduce, "RED_", "S2G_") # dim # "D" # !if(is_shared32, "_SHARED32", "") # !if(!eq(mode, "tile"), "_TILE", "_IM2COL"); @@ -596,11 +598,37 @@ multiclass CP_ASYNC_BULK_TENSOR_S2G_INTR { Requires<[hasPTX<80>, hasSM<90>]>; } +def TMAReductionFlags : Operand { + let PrintMethod = "printTmaReductionMode"; +} + +// TMA Copy from Shared to Global memory with Reduction +multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR { + defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i)); + defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", "); + defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]"; + defvar rc = !if(shared32, Int32Regs, Int64Regs); + + defvar prefix = "cp.reduce.async.bulk.tensor" # "." # dim # "d" # ".global.shared::cta"; + defvar suffix = "." 
# mode # ".bulk_group"; + + def NAME: NVPTXInst<(outs), + !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins TMAReductionFlags:$red_op)), + !strconcat(prefix, "${red_op}", suffix, asm_str, ";"), []>, + Requires<[hasPTX<80>, hasSM<90>]>; + def NAME # _CH: NVPTXInst<(outs), + !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins Int64Regs:$ch, TMAReductionFlags:$red_op)), + !strconcat(prefix, "${red_op}", suffix, ".L2::cache_hint", asm_str, ", $ch;"), []>, + Requires<[hasPTX<80>, hasSM<90>]>; +} + foreach dim = [1, 2, 3, 4, 5] in { foreach shared32 = [true, false] in { foreach mode = !if(!ge(dim, 3), ["tile", "im2col_no_offs"], ["tile"]) in { defm S2G_STRINGS.intr_name : CP_ASYNC_BULK_TENSOR_S2G_INTR; + defm S2G_STRINGS.intr_name : + CP_ASYNC_BULK_TENSOR_REDUCE_INTR; } } } diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll new file mode 100644 index 0000000000000..38c9090bc6b25 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll @@ -0,0 +1,426 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX %s +; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.1d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i64 %ch, i1 %flag_ch); +declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.2d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i64 %ch, i1 %flag_ch); +declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.3d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch); +declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.4d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %flag_ch); +declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.5d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag_ch); + +declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.im2col.3d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch); +declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.im2col.4d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %flag_ch); +declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.im2col.5d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag_ch); + +; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_1d +define void @cp_async_bulk_tensor_reduce_tile_1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch) { +; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_1d( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b32 %r<2>; +; CHECK-PTX-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_1d_param_0]; +; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_1d_param_1]; +; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_1d_param_2]; +; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_1d_param_3]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; +; CHECK-PTX-NEXT: 
cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1}], [%rd1]; +; CHECK-PTX-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr 
addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_2d +define void @cp_async_bulk_tensor_reduce_tile_2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch) { +; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_2d( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b32 %r<3>; +; CHECK-PTX-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_2d_param_0]; +; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_2d_param_1]; +; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_2d_param_2]; +; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_2d_param_3]; +; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_2d_param_4]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; +; CHECK-PTX-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.2d(ptr addrspace(3) %src, ptr %tmap, 
i32 %d0, i32 %d1, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_3d +define void @cp_async_bulk_tensor_reduce_tile_3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) { +; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_3d( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_3d_param_0]; +; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_3d_param_1]; +; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_3d_param_2]; +; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_3d_param_3]; +; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_3d_param_4]; +; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_3d_param_5]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], 
%rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; +; CHECK-PTX-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_4d +define void 
@cp_async_bulk_tensor_reduce_tile_4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) { +; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_4d( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_4d_param_0]; +; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_4d_param_1]; +; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_4d_param_2]; +; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_4d_param_3]; +; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_4d_param_4]; +; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_4d_param_5]; +; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_4d_param_6]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; +; CHECK-PTX-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, 
i32 %d2, i32 %d3, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_5d +define void @cp_async_bulk_tensor_reduce_tile_5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { +; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_5d( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_5d_param_0]; +; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_5d_param_1]; +; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_5d_param_2]; +; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_5d_param_3]; +; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_5d_param_4]; +; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_5d_param_5]; +; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_tile_5d_param_6]; +; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_5d_param_7]; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; +; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, 
%r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_reduce_im2col_3d
+define void @cp_async_bulk_tensor_reduce_im2col_3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) {
+; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_im2col_3d(
+; CHECK-PTX: {
+; CHECK-PTX-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
+; CHECK-PTX-EMPTY:
+; CHECK-PTX-NEXT: // %bb.0:
+; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_3d_param_0];
+; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_3d_param_1];
+; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_3d_param_2];
+; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_3d_param_3];
+; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_3d_param_4];
+; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_3d_param_5];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
+; CHECK-PTX-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
+  ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_reduce_im2col_4d
+define void @cp_async_bulk_tensor_reduce_im2col_4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) {
+; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_im2col_4d(
+; CHECK-PTX: {
+; CHECK-PTX-NEXT: .reg .b32 %r<5>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
+; CHECK-PTX-EMPTY:
+; CHECK-PTX-NEXT: // %bb.0:
+; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_4d_param_0];
+; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_4d_param_1];
+; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_4d_param_2];
+; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_4d_param_3];
+; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_4d_param_4];
+; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_4d_param_5];
+; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_4d_param_6];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
+; CHECK-PTX-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
+  ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_reduce_im2col_5d
+define void @cp_async_bulk_tensor_reduce_im2col_5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
+; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_im2col_5d(
+; CHECK-PTX: {
+; CHECK-PTX-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
+; CHECK-PTX-EMPTY:
+; CHECK-PTX-NEXT: // %bb.0:
+; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_5d_param_0];
+; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_5d_param_1];
+; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_5d_param_2];
+; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_5d_param_3];
+; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_5d_param_4];
+; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_5d_param_5];
+; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_im2col_5d_param_6];
+; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_5d_param_7];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK-PTX-NEXT: ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
+
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
+  ret void
+}
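+
+; Note on the flag operand exercised above: the trailing i1 selects the
+; ".L2::cache_hint" form of the instruction and must be a compile-time
+; constant. When the flag is 0, the i64 cache-hint operand is unused, which
+; is why those calls pass "i64 undef" for it.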