From 4842b279493eea9087ceedab8344e637f42d568e Mon Sep 17 00:00:00 2001 From: Pradeep Kumar Date: Tue, 1 Jul 2025 07:20:14 +0000 Subject: [PATCH] [NVVM][NVPTX] Add support for tcgen05.mma This commit adds support for tcgen05.mma instructions in NVPTX which tests under CodeGen/NVPTX/tcgen05-mma*. This tcgen05.mma instructions are modeled as intrinsics with multiple arguments to model cta_group, mma kind, collector usage etc. The rationale for the design is present documented in NVPTXUsage.rst file --- llvm/docs/NVPTXUsage.rst | 464 +++++++++++- llvm/include/llvm/IR/IntrinsicsNVVM.td | 213 +++++- llvm/include/llvm/IR/NVVMIntrinsicUtils.h | 9 + llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 193 +++++ llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 27 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 422 ++++++++++- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 2 +- .../NVPTX/tcgen05-mma-block-scale-ptx88.ll | 526 ++++++++++++++ .../CodeGen/NVPTX/tcgen05-mma-block-scale.ll | 291 ++++++++ .../NVPTX/tcgen05-mma-disable-output-lane.ll | 681 ++++++++++++++++++ .../test/CodeGen/NVPTX/tcgen05-mma-invalid.ll | 4 + .../test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll | 412 +++++++++++ llvm/test/CodeGen/NVPTX/tcgen05-mma-ws.ll | 569 +++++++++++++++ llvm/test/CodeGen/NVPTX/tcgen05-mma.ll | 601 ++++++++++++++++ 14 files changed, 4402 insertions(+), 12 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale-ptx88.ll create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale.ll create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-mma-disable-output-lane.ll create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-mma-invalid.ll create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-mma-ws.ll create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-mma.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index d28eb6860c33a..5d8fe59128400 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ 
-1945,6 +1945,464 @@ The last argument `i1 %unpack` is a compile-time constant which when set, indica For more information, refer to the `PTX ISA `__. +tcgen05.mma Intrinsics +---------------------- + +Overview +^^^^^^^^ + +`tcgen05.mma` operation of shape `M x N x K` perform matrix multiplication and +accumulation of the form: `D = A * B + D` where: + + - the `A` matrix has shape `M x K`, in either `Tensor Memory` or `Shared Memory` + - the `B` matrix has shape `K x N`, in `Shared Memory` of the current CTA and, optionally in peer CTA + - the `D` matrix is of the shape `M x N`, in `Tensor Memory` + +Optionally an input predicate can be used to disable the input (`%enable_inp_d`) +from the accumulator matrix and the following operation can be performed as `D = A * B` + +The matrix multiplication and accumulation operations are categorized into various +kinds based on input types and the throughput of the multiplication operation. +The following table shows the different kinds of MMA operations that are supported: + ++------------+--------------------------------------------+ +| .kind | Supported Input Types | ++============+============================================+ +| f16 | F16 and BF16 | ++------------+--------------------------------------------+ +| tf32 | TF32 | ++------------+--------------------------------------------+ +| f8f6f4 | All combinations of F8, F6, and F4 | ++------------+--------------------------------------------+ +| i8 | Signed and Unsigned 8-bit Integers | ++------------+--------------------------------------------+ +| mxf8f6f4 | MX-floating point formats | ++------------+--------------------------------------------+ +| mxf4 | MX-floating point formats (FP4) | ++------------+--------------------------------------------+ +| mxf4nvf4 | MXF4 + custom NVIDIA 4-bit floating point | +| | (with common scaling factor) | ++------------+--------------------------------------------+ + +`tcgen05.mma.sp` supports sparse variant of `A` with shape `M x K` 
stored in packed +form as `M X (K / 2)` in memory. The `%spmetadata` specifies the mapping of the +`K / 2` non-zero elements to the `K` elements before performing the MMA operation. + +`tcgen05.mma.block_scale` perform matrix multiplication with block scaling +`D = (A * scale_A) * (B * scale_B) + D` where scaling of input matrices from +memory to form the matrix `A` and matrix `B` before performing the MMA operation. +Scale factors for `A` and `B` matrices need to be duplicated to all 32 lane partitions +of tensor memory. The shape of `%scale_a` and `%scale_b` matrices depend on the +`.scale_vectorsize` described in `here `__ + +The sparsity metadata (`%spmetadata`) as well as the block-scale inputs for `A / B` +matrices (`%scale_a` and `%scale_b`) reside in Tensor Memory. + +To facilitate opportunistic re-use of `A / B` matrix data across a sequence of MMA +operations, the `A/B` matrices are loaded into a collector buffer +(`%collector_usage_a_op_flag`, `%collector_usage_b_buffer_flag`, and `%collector_usage_b_op_flag`). +The flag value of the collector_usage flag in the intrinsic specifies the nature of the re-use + +There are three kinds of matrix descriptors used by the tcgen05 family of instructions: + ++----------------------------+-----------------------------------------------------------------------------------------------------------+-------------+ +| Descriptor | Description | Size (bits) | ++============================+===========================================================================================================+=============+ +| Shared Memory Descriptor | Describes properties of multiplicand matrix | | +| | in shared memory, including its location | | +| | within the CTA's shared memory. 
| 64 | +| | `PTX ISA `__ | | ++----------------------------+-----------------------------------------------+-------------+---------------------------------------------+-------------+ +| Instruction Descriptor | Describes shapes, types, and details of | | +| | all matrices and the MMA operation. | 32 | +| | `PTX ISA `__ | | ++----------------------------+-----------------------------------------------+-------------+---------------------------------------------+-------------+ +| Zero-Column Mask Descriptor| Generates a mask specifying which columns of | | +| | B matrix are zeroed in the MMA operation, | | +| | regardless of values in shared memory. | 64 | +| | Total mask size = N bits | | +| | `PTX ISA `__ | | ++----------------------------+-----------------------------------------------+-------------+---------------------------------------------+-------------+ + +`tcgen05.mma` can be used for general matrix multiplication or for convolution operations. +In case of convolutions, the `activations` can be stored in either matrix `A` or matrix `B` +while the `weights` will be stored in the other matrix + +`tcgen05.mma` has an optional collector qualifier to specify when an `A` or `B` matrix +is new to the sequence and should be loaded, unchanged within the sequence and, +should be reused, or the last use in the sequence and should be discarded. +The collector qualifier is used to give the TensorCore permission to reuse a +previously loaded `A` or `B` matrix; however reuse is opportunistic in that the +TensorCore may reload a matrix even when it has permission to reuse that matrix. +Thus, the source memory of an A or B matrix must not be modified while the MMA +instruction using those matrices has not completed - regardless of collector +qualifier permissions. + +The `cta_group::1` specifies that the operation is performed on the Tensor Memory +of the executing thread’s CTA only. 
The `cta_group::2` specifies that the MMA +operation is performed on the Tensor Memory of the executing thread’s CTA and its peer CTA. + +The vector operand `%disable_output_lane` specifies the lane(s) in the Tensor Memory +that should be not be updated with the resultant matrix D. Elements of the vector operand +disable-output-lane forms a mask where each bit corresponds to a lane of the Tensor Memory, +with least significant bit of the first element of the vector (leftmost in syntax) +corresponding to the lane 0 of the Tensor Memory. If a bit in the mask is 1, then +the corresponding lane in the Tensor Memory for the resultant matrix D will not be +updated + +Intrinsic Design: +^^^^^^^^^^^^^^^^^ + +Given the broad feature set of `tcgen05.mma` instruction modeling these +through intrinsics is highly complex, and the following table outlines the large +number of intrinsics required to fully support the `tcgen05.mma` instruction set. + ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| variant | Configuration | Total Variants | ++====================================+===================================================================================================+================+ +| tcgen05.mma.shared | 2 (space) x 2 (sp) x 4 (kind) x 2 (cta_group) x 4 (collector_usage) | 128 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.tensor.ashift | 2 (sp) x 4 (kind) x 2 (cta_group) x 2 (collector_usage) | 32 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.scale_d | 2 (space) x 2 (sp) x 2 (kind) x 2 (cta_group) x 4 (collector_usage) | 128 | 
++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.scale_d.tensor.ashift | 2 (sp) x 2 (kind) x 2 (cta_group) x 2 (collector_usage) | 16 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.disable_output_lane | 2 (space) x 2 (sp) x 4 (kind) x 2 (cta_group) x 4 (collector_usage) | 128 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.disable_output_lane... | 2 (sp) x 4 (kind) x 2 (cta_group) x 2 (collector_usage) | 32 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.block_scale | 2 (space) x 1 (mxf4nvf4) x 2 (cta_group) x 2 (scale_vec_size) x 4 (collector_usage) | 32 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.block_scale | 2 (space) x 1 (mxf4) x 2 (cta_group) x 2 (scale_vec_size) x 4 (collector_usage) | 32 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.block_scale | 2 (space) x 1 (mxf8f6f4) x 2 (cta_group) x 2 (scale_vec_size) x 4 (collector_usage) | 32 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.ws | 2 (space) x 2 (sp) x 4 (kind) x 2 (zero_col_mask) x 4 (collector_usage_op) x 4 (collector_buffer) | 256 | 
++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| Total | | 816 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ + + +To reduce the number of possible intrinsic variations, we've modeled the `tcgen05.mma` +instructions using flag operands. We've added range checks to these flags to prevent +invalid values. We also expanded some flags back into intrinsic modifiers to avoid +supporting invalid combinations of features. + + +'``llvm.nvvm.tcgen05.mma.*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i32 %kind_flag, i32 %cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i32 %kind_flag, i32 %cta_group_flag, i32 %collector_usage_a_op_flag) + + ; .sp variants + declare void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i32 %kind_flag, i32 %cta_group_flag, i32 %collector_usage_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i32 %kind_flag, i32 %cta_group_flag, i32 %collector_usage_a_op_flag) + + ; .scale_d variants + declare void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %scale_d_imm, i32 %cta_group_flag, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.scale_d<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 
%bdesc, i32 %idesc, i1 %enable_inp_d, i64 %scale_d_imm, i32 %cta_group_flag, i32 %kind_flag, i32 %collector_usage_a_op_flag) + + ; sp.scale_d variants + declare void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i64 %scale_d_imm, i32 %cta_group_flag, i32 %collector_usage_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i64 %scale_d_imm, i32 %cta_group_flag, i32 %collector_usage_a_op_flag) + +Overview: +""""""""" + +`nvvm.tcgen05.mma` is an asynchronous intrinsic which initiates an `M x N x K` matrix +multiply and accumulate operation, `D = A * B + D` where the `A` matrix is `M x K`, +the `B` matrix is `K x N`, and the `D` matrix is `M x N`. The operation of the form +`D = A*B` is issued when the input predicate argument `%enable_inp_d` is false. +The optional immediate argument `%scale_d_imm` can be specified to scale the input +matrix `D` as follows: `D = A * B + D * (2 ^ - %scale_d_imm)`. The valid range of +values for argument `%scale_d_imm` is `[0, 15]`. The 32-bit register operand idesc +is the instruction descriptor as described in `Instruction descriptor `__ + +`nvvm.tcgen05.mma` has single thread semantics, unlike the collective instructions +`nvvm.mma.sync` or the PTX `wgmma.mma_async` instruction. So, a single thread issuing +the `nvvm.tcgen05.mma` will result in the initiation of the whole matrix multiply and accumulate +operation + +When `.sp` is specified, the dimension of A matrix is `M x (K/2)` and requires +specifying an additional `%spmetadata` argument + +`.ashift` shifts the rows of the A matrix down by one row, except for the last row +in the Tensor Memory. `.ashift` is only allowed with M = 128 or M = 256. + +The `%collector_usage_a_op_flag` flag specifies the usage of collector buffer for +matrix `A`. 
It is illegal to specify either of `USE` or `FILL` for `%collector_usage_a_op_flag` +along with `.ashift` + +For more information, refer to the +`PTX ISA `__ + +The following tables describes the possible values of the flag arguments + +`%kind_flag` flag: + +============= ========== + `kind_flag` value +============= ========== + F16 0 + TF32 1 + F8F6F4 2 + I8 3 +============= ========== + +`%cta_group_flag` flag: + +================= ========== + `cta_group_flag` value +================= ========== + CG1 1 + CG2 2 +================= ========== + +`%collector_usage_a_op_flag` flag: + +============================= ========== + `collector_usage_a_op_flag` value +============================= ========== + DISCARD 0 + LASTUSE 1 + USE 2 + FILL 3 +============================= ========== + +'``llvm.nvvm.tcgen05.mma.block_scale*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + ; mxf8f6f4 + declare void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %d, i64 %adesc, addrspace(3) %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %d, i64 %adesc, addrspace(3) %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void 
@llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + + ; mxf4 + declare void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %d, i64 %adesc, addrspace(3) %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.shared.mxf4.block32.block_scale(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block32.block_scale(ptr addrspace(6) %d, ptr 
addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %d, i64 %adesc, addrspace(3) %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block32.block_scale(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block32.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + + ; mxf4nvf4 + declare void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr 
addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + +Overview: +""""""""" +`nvvm.tcgen05.mma.block_scale` is an asynchronous intrinsic which initiates an `M x N x K` matrix multiply and accumulate operation, `D = (A * scale_a) * (B * scale_b) + D` where the `A` matrix is `M x K`, the `B` matrix is `K x N`, and the `D` matrix is `M x N`. The matrices `A` and `B` are scaled with `%scale_a` and `%scale_b` matrices respectively before performing the matrix multiply and accumulate operation. 
The operation of the form `D = A*B` is issued when the input predicate argument `%enable_inp_d` is false. The 32-bit register operand idesc is the instruction descriptor as described in `Instruction descriptor `__ + +`nvvm.tcgen05.mma.block_scale` has single thread semantics, unlike the collective instructions `nvvm.mma.sync` or the PTX `wgmma.mma_async` instruction. So, a single thread issuing the `nvvm.tcgen05.mma.block_scale` will result in the initiation of the whole matrix multiply and accumulate operation + +When `.sp` is specifed, the dimension of A matrix is `M x (K / 2)` and requires specifiying an additional `%spmetadata` argument + +The `%collector_usage_a_op_flag` flag specifies the usage of collector buffer for matrix `A` + +For more information, refer to the +`PTX ISA `__ + +The following tables describes the possible values of the flag arguments + +`%cta_group`: + +============= ========== + `cta_group` value +============= ========== + CG1 1 + CG2 2 +============= ========== + +`%collector_usage_a_op_flag`: + +============================= ========== + `collector_usage_a_op_flag` value +============================= ========== + DISCARD 0 + LASTUSE 1 + USE 2 + FILL 3 +============================= ========== + +'``llvm.nvvm.tcgen05.mma.disable_output_lane*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm + + declare void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %d, i64 %adesc, addrspace(3) %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %d, i64 %adesc, addrspace(3) %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_a_op_flag) + + ; .sp variants + declare void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, 
<8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_a_op_flag) + + ; .scale_d variants + declare void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) %d, i64 %adesc, addrspace(3) %b, i32 %idesc, i1 %enable_inp_d, i64 %scale_d_imm, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) %d, i64 %adesc, addrspace(3) %b, i32 %idesc, i1 %enable_inp_d, i64 %scale_d_imm, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %scale_d_imm, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %scale_d_imm, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_a_op_flag) + + ; .sp.scale_d variants + declare void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i64 %scale_d_imm, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i64 %scale_d_imm, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i64 
%scale_d_imm, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, %i64 %scale_d_imm, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_a_op_flag) + +Overview: +""""""""" + +`nvvm.tcgen05.mma.disable_output_lane` is an asynchronous intrinsic which initiates an `M x N x K` matrix multiply and accumulate operation, `D = A * B + D` where the `A` matrix is `M x K`, the `B` matrix is `K x N`, and the `D` matrix is `M x N`. The operation of the form `D = A*B` is issued when the input predicate argument `%enable_inp_d` is false. The optional immediate argument `%scale_d_imm` can be specified to scale the input matrix `D` as follows: `D = A*B+D * (2 ^ - %scale_d_imm)`. The valid range of values for argument `%scale_d_imm` is `[0, 15]`. The 32-bit register operand idesc is the instruction descriptor as described in `Instruction descriptor `__ + +The vector operand `%disable_output_lane` specifies the lane(s) in the Tensor Memory that should be not be updated with the resultant matrix `D`. Elements of the vector operand `%disable_output_lane` forms a mask where each bit corresponds to a lane of the Tensor Memory, with least significant bit of the first element of the vector corresponding to the `lane 0` of the Tensor Memory. If a bit in the mask is 1, then the corresponding lane in the Tensor Memory for the resultant matrix `D` will not be updated + +`nvvm.tcgen05.mma.disable_output_lane` has single thread semantics, unlike the collective instructions `nvvm.mma.sync` or the PTX `wgmma.mma_async` instruction. 
So, a single thread issuing the `nvvm.tcgen05.mma.disable_output_lane` will result in the initiation of the whole matrix multiply and accumulate operation + +When `.sp` is specifed, the dimension of A matrix is `M x (K / 2)` and requires specifiying an additional `%spmetadata` argument + +`.ashift` shifts the rows of the A matrix down by one row, except for the last row in the Tensor Memory. `.ashift` is only allowed with M = 128 or M = 256. + +The `%collector_usage_a_op_flag` flag specifies the usage of collector buffer for matrix `A`. It is illegal to specify either of `USE` or `FILL` for `%collector_usage_a_op_flag` along with `.ashift` + +For more information, refer to the `PTX ISA `__ + +The following tables describes the possible values of the flag arguments + +`%kind_flag`: + +============= ========== + `kind_flag` value +============= ========== + F16 0 + TF32 1 + F8F6F4 2 + I8 3 +============= ========== + +`%cta_group_flag`: + +================= ========== + `cta_group_flag` value +================= ========== + CG1 1 + CG2 2 +================= ========== + +`%collector_usage_a_op_flag`: + +============================= ========== + `collector_usage_a_op_flag` value +============================= ========== + DISCARD 0 + LASTUSE 1 + USE 2 + FILL 3 +============================= ========== + + +'``llvm.nvvm.tcgen05.mma.ws*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm + + // tcgen05.mma.ws + declare void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %d, addrspace(3) %ashared, addrspace(3) %b, i32 %idesc, i1 %enable_inp_d, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %d, addrspace(3) %ashared, addrspace(3) %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + + ; .sp variants + declare void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %d, addrspace(3) %ashared, addrspace(3) %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %d, addrspace(3) %ashared, addrspace(3) %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 %zero_col_mask, i32 %kind_flag, i32 
%collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %d, ptr addrspace(6) %a, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 %zero_col_mask, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + +Overview: +""""""""" + +`nvvm.tcgen05.mma.ws` is an asynchronous intrinsic which initiates an `M x N x K` weight stationary convolution matrix multiply and accumulate operation, `D = A * B + D` where the `A` matrix is `M x K`, the `B` matrix is `K x N`, and the `D` matrix is `M x N`. The operation of the form `D = A*B` is issued when the input predicate argument `%enable_inp_d` is false. The 32-bit register operand idesc is the instruction descriptor as described in `Instruction descriptor `__ + +`nvvm.tcgen05.mma.ws` has single thread semantics, unlike the collective instructions `nvvm.mma.sync` or the PTX `wgmma.mma_async` instruction. So, a single thread issuing the `nvvm.tcgen05.mma.ws` will result in the initiation of the whole matrix multiply and accumulate operation + +When `.sp` is specified, the dimension of A matrix is `M x (K / 2)` and requires specifying an additional `%spmetadata` argument + +The operand `%zero_col_mask` is a 64-bit register which specifies the `Zero-Column Mask Descriptor `__. The zero-column mask descriptor is used to generate a mask that specifies which columns of `B` matrix will have zero value for the matrix multiply and accumulate operation regardless of the values present in the shared memory. 
+ +The `%collector_usage_b_buffer_flag` and `%collector_usage_b_op_flag` together specify the usage of collector buffer for Matrix `B` + +For more information, refer to the +`PTX ISA `__ + +The following tables describe the possible values of the flag arguments + +`%kind_flag`: + +============= ========== + `kind_flag` value +============= ========== + F16 0 + TF32 1 + F8F6F4 2 + I8 3 +============= ========== + +`%collector_usage_b_buffer_flag`: + +================================ ========== + `collector_usage_b_buffer_flag` value +================================ ========== + B0 0 + B1 1 + B2 2 + B3 3 +================================ ========== + +`%collector_usage_b_op_flag`: + +============================= ========== + `collector_usage_b_op_flag` value +============================= ========== + DISCARD 0 + LASTUSE 1 + FILL 2 + USE 3 +============================= ========== + Store Intrinsics ---------------- @@ -2012,7 +2470,7 @@ The completion of the write of each local response is tracked by independent mbarriers at the corresponding shared memory location of each CTA in the cluster. -For more information, refer `PTX ISA `__. +For more information, refer `PTX ISA `__. '``llvm.nvvm.clusterlaunchcontrol.query_cancel.is_canceled``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -2038,7 +2496,7 @@ it returns ``1`` (true). A true result indicates that: - no other successful response of another ``try_cancel`` request in the grid will contain the first CTA id of that cluster -For more information, refer `PTX ISA `__. +For more information, refer `PTX ISA `__. '``llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid.*``' Intrinsics @@ -2067,7 +2525,7 @@ If the request succeeded: If the request failed, the behavior of these intrinsics is undefined. -For more information, refer `PTX ISA `__. +For more information, refer `PTX ISA `__. 
Perf Monitor Event Intrinsics ----------------------------- diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 967d1663f237b..20c7541e6f684 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -764,6 +764,76 @@ class NVVM_TCGEN05_LDST_ACCESS_SIZE { true : llvm_void_ty); } +class NVVM_TCGEN05_MMA_BASE { + LLVMType a_operand_type = !if(!eq(Space, "tensor"), + llvm_tmem_ptr_ty, llvm_i64_ty); + list common_args = [llvm_tmem_ptr_ty, // d + a_operand_type, // a + llvm_i64_ty, // b + llvm_i32_ty, // idesc + llvm_i1_ty]; // enable_input_d + list common_intr_props = !listconcat( + [IntrArgMemOnly, WriteOnly>], + !if(!eq(Space, "tensor"), [ReadOnly>], []) + ); +} + +class NVVM_TCGEN05_MMA: + NVVM_TCGEN05_MMA_BASE { + string intr = "llvm.nvvm.tcgen05.mma" + # !if(!eq(Sp, 1), ".sp", "") + # "." # Space + # !if(!eq(ScaleInputD, 1), ".scale_d", "") + # !if(!eq(AShift, 1), ".ashift", ""); + string record = !subst(".", "_", !subst("llvm.", "int_", intr)); +} + +class NVVM_TCGEN05_MMA_BLOCKSCALE: + NVVM_TCGEN05_MMA_BASE { + string intr = "llvm.nvvm.tcgen05.mma" + # !if(!eq(Sp, 1), ".sp", "") + # "." # Space + # "." # Kind # ScaleVecSize + # ".block_scale"; + string record = !subst(".", "_", !subst("llvm.", "int_", intr)); +} + +class NVVM_TCGEN05_MMA_WS: + NVVM_TCGEN05_MMA_BASE { + string intr = "llvm.nvvm.tcgen05.mma.ws" + # !if(!eq(Sp, 1), ".sp", "") + # "." # Space + # !if(!eq(ZeroColMask, 1), ".zero_col_mask", ""); + string record = !subst(".", "_", !subst("llvm.", "int_", intr)); +} + +class NVVM_TCGEN05_MMA_DISABLE_OUTPUT_LANE: + NVVM_TCGEN05_MMA_BASE { + string intr = "llvm.nvvm.tcgen05.mma" + # !if(!eq(Sp, 1), ".sp", "") + # "." 
# Space + # !if(!eq(ScaleInputD, 1), ".scale_d", "") + # ".disable_output_lane.cg" # CtaGroup + # !if(!eq(AShift, 1), ".ashift", ""); + string record = !subst(".", "_", !subst("llvm.", "int_", intr)); +} + +class NVVM_TCGEN05_MMA_BLOCKSCALE_SUPPORTED { + bit ret = !cond( + !and(!eq(Kind, "mxf8f6f4"), !eq(ScaleVecSize, "")) : true, + !and(!eq(Kind, "mxf4"), !eq(ScaleVecSize, "")) : true, + !and(!eq(Kind, "mxf4nvf4"), !eq(ScaleVecSize, ".block16")) : true, + !and(!eq(Kind, "mxf4"), !eq(ScaleVecSize, ".block32")) : true, + !and(!eq(Kind, "mxf4nvf4"), !eq(ScaleVecSize, ".block32")) : true, + !and(!eq(Kind, "mxf8f6f4"), !eq(ScaleVecSize, ".block32")) : true, + true: false + ); +} + class TexVector types> { string Name = name; list Types = types; @@ -2070,13 +2140,15 @@ def int_nvvm_exit : NVVMBuiltin, class DefaultAttrsIntrinsicFlags ret_types, list param_types, list flags, - list intr_properties> + list intr_properties, + string name = ""> : DefaultAttrsIntrinsic< ret_types, !listconcat(param_types, flags), !listconcat(intr_properties, !foreach(i, !range(flags), - ImmArg>))>; + ImmArg>)), + name>; // TMA Tensor Copy Intrinsics: S2G -> From Shared to Global memory variants foreach dim = 1...5 in { @@ -2464,4 +2536,139 @@ def int_nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_ # dim "llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid." 
# dim>; } -} // let TargetPrefix = "nvvm" +// +// tcgen05.mma intrinsics +// + +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach scale_d = [0, 1] in { + foreach ashift = !if(!eq(space, "tensor"), [0, 1], [0]) in { + defvar mma = NVVM_TCGEN05_MMA; + defvar args = !listconcat( + mma.common_args, + !if(!eq(sp, 1), [llvm_tmem_ptr_ty], []), + !if(!eq(scale_d, 1), [llvm_i64_ty], []) + ); + defvar flags = [llvm_i32_ty, // kind + llvm_i32_ty, // cta_group + llvm_i32_ty]; // collector_usage_a + defvar nargs = !size(args); + defvar scale_d_imm = ArgIndex; + defvar intrinsic_properties = !listconcat( + mma.common_intr_props, + !if(!eq(scale_d, 1), + [ImmArg, Range], []), + [Range, 0, !if(!eq(scale_d, 1), 2, 4)>, + Range, 1, 3>, + Range, 0, !if(!eq(ashift, 1), 2, 4)>] + ); + + def mma.record: + DefaultAttrsIntrinsicFlags<[], args, flags, intrinsic_properties, + mma.intr>; + } + } + } +} + +// +// tcgen05.mma disable_output_lane intrinsics +// +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach cta_group = [1, 2] in { + foreach scale_d = [0, 1] in { + foreach ashift = !if(!eq(space, "tensor"), [0, 1], [0]) in { + defvar mma = NVVM_TCGEN05_MMA_DISABLE_OUTPUT_LANE< + sp, space, cta_group, ashift, scale_d>; + defvar disable_output_lane_type = + !if(!eq(cta_group, 1), llvm_v4i32_ty, llvm_v8i32_ty); + defvar args = !listconcat( + mma.common_args, + !if(!eq(sp, 1), [llvm_tmem_ptr_ty], []), + !if(!eq(scale_d, 1), [llvm_i64_ty], []), + [disable_output_lane_type] + ); + defvar flags = [llvm_i32_ty, // kind_flag + llvm_i32_ty]; // collector_usage_a_flag + defvar nargs = !size(args); + defvar scale_d_flag = ArgIndex; + defvar scale_d_imm_range = [ImmArg, Range]; + defvar intrinsic_properties = !listconcat( + mma.common_intr_props, + !if(!eq(scale_d, 1), scale_d_imm_range, []), + [Range, 0, !if(!eq(scale_d, 1), 2, 4)>, + Range, 0, !if(!eq(ashift, 1), 2, 4)>] + ); + + def mma.record: DefaultAttrsIntrinsicFlags<[], args, flags, 
intrinsic_properties, + mma.intr>; + } // ashift + } // scale_d + } // cta_group + } // space +} // sp + +// +// tcgen05.mma block_scale intrinsics +// +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach kind = ["mxf8f6f4", "mxf4", "mxf4nvf4"] in { + foreach scale_vec_size = ["", ".block16", ".block32"] in { + defvar mma = NVVM_TCGEN05_MMA_BLOCKSCALE; + defvar cta_group = ArgIndex; + defvar collector_usage = ArgIndex; + + if NVVM_TCGEN05_MMA_BLOCKSCALE_SUPPORTED.ret then { + def mma.record: DefaultAttrsIntrinsicFlags<[], + !listconcat(mma.common_args, + !if(!eq(sp, 1), + [llvm_tmem_ptr_ty], []), // spmetadata + [llvm_tmem_ptr_ty, // scale a + llvm_tmem_ptr_ty]), // scale b + // flags + [llvm_i32_ty, // cta_group + llvm_i32_ty], // collector_usage_a + !listconcat(mma.common_intr_props, + [Range, + Range]), + mma.intr>; + } + } + } + } +} + +// +// tcgen05.mma ws intrinsics +// +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach zero_col_mask = [0, 1] in { + defvar mma = NVVM_TCGEN05_MMA_WS; + defvar args = !listconcat( + mma.common_args, + !if(!eq(sp, 1), [llvm_tmem_ptr_ty], []), + !if(!eq(zero_col_mask, 1), [llvm_i64_ty], []) + ); + defvar flags = [llvm_i32_ty, // kind + llvm_i32_ty, // collector_buffer_b + llvm_i32_ty]; // collector_usage_b_op + defvar nargs = !size(args); + defvar intrinsic_properties = !listconcat( + mma.common_intr_props, + [Range, 0, 4>, + Range, 0, 4>, + Range, 0, 4>] + ); + + def mma.record: + DefaultAttrsIntrinsicFlags<[], args, flags, intrinsic_properties, + mma.intr>; + } + } +} + +} // let TargetPrefix = "nvvm" \ No newline at end of file diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h index 11bfd733a8854..f18e55c744124 100644 --- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h +++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h @@ -47,6 +47,15 @@ enum class CTAGroupKind : uint8_t { CG_2 = 2, // cta_group::2 modifier }; +enum class 
Tcgen05MMAKind : uint8_t { F16 = 0, TF32 = 1, F8F6F4 = 2, I8 = 3 }; + +enum class Tcgen05CollectorUsageOp : uint8_t { + DISCARD = 0, + LASTUSE = 1, + FILL = 2, + USE = 3, +}; + inline bool FPToIntegerIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) { switch (IntrinsicID) { case Intrinsic::nvvm_f2i_rm_ftz: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 15f45a1f35e2f..4d32eb40e0325 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1157,6 +1157,34 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X) MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y) MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT) + MAKE_CASE( + NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT) + MAKE_CASE( + NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1) + 
MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE( + NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT) + MAKE_CASE( + NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT) } return nullptr; @@ -2507,6 +2535,95 @@ static SDValue LowerTcgen05St(SDValue Op, SelectionDAG &DAG) { return Tcgen05StNode; } +static unsigned getTcgen05MMADisableOutputLane(unsigned IID) { + switch (IID) { + case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2: + return 
NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift: + return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT; + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift: + return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT; + case Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift: + return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT; + case Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift: + return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT; + case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift: + return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT; + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift: + return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT; + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1; + case 
Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: + return NVPTXISD:: + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT; + case Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: + return NVPTXISD:: + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT; + }; + llvm_unreachable("unhandled tcgen05.mma.disable_output_lane intrinsic"); +} + +static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG) { + SDNode *N = Op.getNode(); + SDLoc DL(N); + unsigned IID = cast(N->getOperand(1))->getZExtValue(); + + SmallVector Ops; + // split the vector argument + for (size_t I = 0; I < N->getNumOperands(); I++) { + if (I == 1) + continue; // skip IID + SDValue Val = N->getOperand(I); + EVT ValVT = Val.getValueType(); + if (ValVT.isVector()) { + EVT EltVT = ValVT.getVectorElementType(); + for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++) + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, + DAG.getIntPtrConstant(J, DL))); + } else + Ops.push_back(Val); + } + + MemIntrinsicSDNode *MemSD = cast(N); + SDValue Tcgen05MMANode = DAG.getMemIntrinsicNode( + getTcgen05MMADisableOutputLane(IID), DL, N->getVTList(), Ops, + MemSD->getMemoryVT(), MemSD->getMemOperand()); + + return Tcgen05MMANode; +} + static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); SDValue Intrin = N->getOperand(1); @@ -2554,6 +2671,35 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) { case Intrinsic::nvvm_tcgen05_st_32x32b_x64: case Intrinsic::nvvm_tcgen05_st_32x32b_x128: return LowerTcgen05St(Op, DAG); + case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2: + case 
Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: + return LowerTcgen05MMADisableOutputLane(Op, DAG); } return Op; } @@ -4530,6 +4676,53 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.align.reset(); return true; } + case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1: + case 
Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: { + // We are reading and writing back to TMem + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::v4i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + Info.align = Align(16); + return true; + } + + case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: { + // We are reading and writing back to TMem + Info.opc = 
ISD::INTRINSIC_VOID; + Info.memVT = MVT::v8i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + Info.align = Align(16); + return true; + } } return false; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index cf72a1e6db89c..2e50d334517c8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -84,7 +84,32 @@ enum NodeType : unsigned { StoreV2, StoreV4, StoreV8, - LAST_MEMORY_OPCODE = StoreV8, + TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT, + TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT, + TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT, + TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT, + TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT, + TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT, + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT, + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT, + LAST_MEMORY_OPCODE = + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT, }; } diff --git 
a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d33719236b172..c17272c7c694a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -5125,7 +5125,7 @@ let Predicates = [hasSM<90>, hasPTX<78>] in { def INT_EXIT : BasicNVPTXInst<(outs), (ins), "exit", [(int_nvvm_exit)]>; // Tcgen05 intrinsics -let isConvergent = true, Predicates = [hasTcgen05Instructions] in { +let isConvergent = true, Predicates = [hasTcgen05Instructions, hasPTX<86>] in { multiclass TCGEN05_ALLOC_INTR { def "" : BasicNVPTXInst<(outs), @@ -5221,7 +5221,7 @@ foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { } } // isConvergent -let hasSideEffects = 1, Predicates = [hasTcgen05Instructions] in { +let hasSideEffects = 1, Predicates = [hasTcgen05Instructions, hasPTX<86>] in { def tcgen05_fence_before_thread_sync: NullaryInst< "tcgen05.fence::before_thread_sync", int_nvvm_tcgen05_fence_before_thread_sync>; @@ -5256,7 +5256,7 @@ class TCGEN05_LDST_REGINFO { class TCGEN05_LD_INST : NVPTXInst<(outs), (ins), "?", []>, - Requires<[hasTcgen05Instructions]> { + Requires<[hasTcgen05Instructions, hasPTX<86>]> { TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO< NVVM_TCGEN05_LDST_ACCESS_SIZE.veclen>; @@ -5281,7 +5281,7 @@ class TCGEN05_LD_INST : class TCGEN05_ST_INST : NVPTXInst<(outs), (ins), "?", []>, - Requires<[hasTcgen05Instructions]> { + Requires<[hasTcgen05Instructions, hasPTX<86>]> { TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO< NVVM_TCGEN05_LDST_ACCESS_SIZE.veclen>; @@ -5385,3 +5385,417 @@ foreach dim = ["x", "y", "z"] in { def CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_ # dim: CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID; } + +// +// tcgen05.mma Instructions +// + +class Tcgen05MMAInst : + NVPTXInst<(outs), (ins), "?", []>, + Requires<[hasTcgen05Instructions, hasPTX<86>]> { + + Intrinsic Intrin = !cast( + NVVM_TCGEN05_MMA.record + ); + + dag ScaleInpIns = !if(!eq(ScaleInputD, 1), (ins 
i64imm:$scale_input_d), (ins)); + string ScaleInpStr = !if(!eq(ScaleInputD, 1), ", $scale_input_d", ""); + dag ScaleInpInput = !if(!eq(ScaleInputD, 1), (Intrin i64:$scale_input_d), (Intrin)); + + dag SparseMetadataIns = !if(!eq(Sp, 1), (ins B32:$spmetadata), (ins)); + dag SparseMetadataIntr = !if(!eq(Sp, 1), (Intrin B32:$spmetadata), (Intrin)); + string SparseMetadataStr = !if(!eq(Sp, 1), ", [$spmetadata]", ""); + + int KindVal = !cond( + !eq(KindStr, "f16"): 0, + !eq(KindStr, "tf32"): 1, + !eq(KindStr, "f8f6f4"): 2, + !eq(KindStr, "i8"): 3, + ); + + int CollectorUsageVal = !cond( + !eq(CollectorUsage, "discard"): 0, + !eq(CollectorUsage, "lastuse"): 1, + !eq(CollectorUsage, "fill"): 2, + !eq(CollectorUsage, "use"): 3 + ); + + string AOperandStr = !if(!eq(ASpace, "tensor"), "[$a]", "$a"); + + NVPTXRegClass ARegClass = !if(!eq(ASpace, "tensor"), B32, B64); + + dag input = !con((ins B32:$dtmem, + ARegClass:$a, ADDR:$b, + B32:$idesc, + B1:$enable_inp_d), + SparseMetadataIns, + ScaleInpIns); + + let InOperandList = input; + let OutOperandList = (outs); + let AsmString = "tcgen05.mma" + # !if(!eq(Sp, 1), ".sp", "") + # ".cta_group::" # CtaGroup + # ".kind::" # KindStr + # ".collector::a::" # CollectorUsage + # !if(!eq(AShift, 1), ".ashift", "") + # " [$dtmem], " # AOperandStr # ", $b" + # SparseMetadataStr + # ", $idesc, $enable_inp_d" + # ScaleInpStr + # ";"; + + dag IntrinsicPattern = !con((Intrin i32:$dtmem, + ARegClass:$a, addr:$b, + i32:$idesc, + i1:$enable_inp_d), + SparseMetadataIntr, + ScaleInpInput); + + dag FlagOperands = (Intrin (i32 KindVal), (i32 CtaGroup), + (i32 CollectorUsageVal)); + + let Pattern = [!con(IntrinsicPattern, FlagOperands)]; +} + +// tcgen05.mma +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach kind = ["f16", "tf32", "f8f6f4", "i8"] in { + foreach cta_group = [1, 2] in { + foreach collector_usage = ["discard", "lastuse", "fill", "use"] in { + foreach scale_input_d = [0, 1] in { + foreach ashift = !if(!eq(space, 
"tensor"), [0, 1], [0]) in { + defvar ScaleInputD = !and(!eq(scale_input_d, 1), + !or(!eq(kind, "f16"), + !eq(kind, "tf32"))); + + def : Tcgen05MMAInst; + } + } + } + } + } + } +} + +class Tcgen05MMADisableOutputLaneTypeProfile: + SDTypeProfile<0, 0, []> { + int DisableOutputLaneVecSize = !mul(4, CtaGroup); + + list VTs = !listconcat( + [i32], // d + !if(!eq(ASpace, "tensor"), [i32], [i64]), // a + [i64, i32, i1], // b, idesc, enable_inp_d + !if(!eq(Sp, 1), [i32], []), // spmetadata + !if(!eq(ScaleInputD, 1), [i64], []), // scale_input_d + !listsplat(i32, DisableOutputLaneVecSize), // disable_output_lane + [i32, i32] // kind, collector_usage + ); + let Constraints = !foreach(x, !range(!size(VTs)), SDTCisVT); + let NumOperands = !size(Constraints); +} + +class Tcgen05MMADisableOutputLaneSDNode: + SDNode<"NVPTXISD::TCGEN05_MMA" + # !if(!eq(Sp, 1), "_SP", "") + # "_" # !toupper(ASpace) + # !if(!eq(ScaleInput, 1), "_SCALE_D", "") + # "_DISABLE_OUTPUT_LANE_CG" # CtaGroup + # !if(!eq(AShift, 1), "_ASHIFT", ""), + Tcgen05MMADisableOutputLaneTypeProfile, + [SDNPHasChain, SDNPSideEffect]>; + +class Tcgen05MMADisableOutputLaneInst : + NVPTXInst<(outs), (ins), "?", []>, + Requires<[hasTcgen05Instructions, hasPTX<86>]> { + + SDNode Opcode = Tcgen05MMADisableOutputLaneSDNode; + + + dag ScaleInpIns = !if(!eq(ScaleInputD, 1), (ins i64imm:$scale_input_d), (ins)); + string ScaleInpStr = !if(!eq(ScaleInputD, 1), ", $scale_input_d", ""); + dag ScaleInpInput = !if(!eq(ScaleInputD, 1), (Opcode i64:$scale_input_d), (Opcode)); + + // disable output lane + int DisableOutputLaneVecSize = !mul(4, CtaGroup); + + dag DisableOutputLaneIns = !dag(ins, + !listsplat(B32, DisableOutputLaneVecSize), + !foreach(x, + !range(DisableOutputLaneVecSize), + "disable_output_lane" # x)); + + dag DisableOutputLaneInput = !dag(Opcode, + !listsplat(i32, DisableOutputLaneVecSize), + !foreach(x, + !range(DisableOutputLaneVecSize), + "disable_output_lane" # x)); + + string DisableOutputLaneStr = "{{" # + 
!interleave( + !foreach(x, + !range(DisableOutputLaneVecSize), + "$disable_output_lane" # x), + ", ") + # "}}"; + + dag SparseMetadataIns = !if(!eq(Sp, 1), (ins B32:$spmetadata), (ins)); + dag SparseMetadataIntr = !if(!eq(Sp, 1), (Opcode i32:$spmetadata), (Opcode)); + string SparseMetadataStr = !if(!eq(Sp, 1), ", [$spmetadata]", ""); + + int KindVal = !cond( + !eq(Kind, "f16"): 0, + !eq(Kind, "tf32"): 1, + !eq(Kind, "f8f6f4"): 2, + !eq(Kind, "i8"): 3, + ); + + int CollectorUsage = !cond( + !eq(CollectorUsageStr, "discard"): 0, + !eq(CollectorUsageStr, "lastuse"): 1, + !eq(CollectorUsageStr, "fill"): 2, + !eq(CollectorUsageStr, "use"): 3, + ); + + string AOperandStr = !if(!eq(ASpace, "tensor"), "[$a]", "$a"); + + NVPTXRegClass ARegClass = !if(!eq(ASpace, "tensor"), B32, B64); + + dag InOperandList = !con((ins B32:$dtmem, + ARegClass:$a, B64:$b, + B32:$idesc, + B1:$enable_inp_d), + SparseMetadataIns, + ScaleInpIns, + DisableOutputLaneIns); + + let OutOperandList = (outs); + let AsmString = "tcgen05.mma" + # !if(!eq(Sp, 1), ".sp", "") + # ".cta_group::" # CtaGroup + # ".kind::" # Kind + # !if(!eq(AShift, 1), ".ashift", "") + # ".collector::a::" # CollectorUsageStr + # " " # "[$dtmem], " # AOperandStr # ", $b" + # SparseMetadataStr + # ", " # "$idesc" + # ", " # DisableOutputLaneStr + # ", $enable_inp_d" + # ScaleInpStr + # ";"; + + dag IntrinsicPattern = !con((Opcode i32:$dtmem, + ARegClass:$a, i64:$b, + i32:$idesc, + i1:$enable_inp_d), + SparseMetadataIntr, + ScaleInpInput, + DisableOutputLaneInput); + + dag FlagOperands = (Opcode (i32 KindVal), (i32 CollectorUsage)); + + let Pattern = [!con(IntrinsicPattern, FlagOperands)]; +} + +// tcgen05.mma.disable_output_lane +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach kind = ["f16", "tf32", "f8f6f4", "i8"] in { + foreach cta_group = [1, 2] in { + foreach collector_usage = ["fill", "use", "lastuse", "discard"] in { + foreach scale_input_d = [0, 1] in { + foreach ashift = !if(!eq(space, 
"tensor"), [0, 1], [0]) in { + defvar ScaleInputD = !and(!eq(scale_input_d, 1), + !or(!eq(kind, "f16"), + !eq(kind, "tf32"))); + def : + Tcgen05MMADisableOutputLaneInst; + } + } + } + } + } + } +} + +class Tcgen05MMABlockScaleInst: + NVPTXInst<(outs), (ins), "?", []>, + Requires<[hasTcgen05Instructions, PTXPredicate]> { + + Intrinsic Intrin = !cast( + NVVM_TCGEN05_MMA_BLOCKSCALE.record); + + dag SparseMetadataIns = !if(!eq(Sp, 1), (ins B32:$spmetadata), (ins)); + dag SparseMetadataIntr = !if(!eq(Sp, 1), (Intrin i32:$spmetadata), (Intrin)); + string SparseMetadataStr = !if(!eq(Sp, 1), ", [$spmetadata]", ""); + + int KindVal = !cond( + !eq(KindStr, "mxf8f6f4") : 0, + !eq(KindStr, "mxf4") : 1, + !eq(KindStr, "mxf4nvf4") : 2, + ); + + int CollectorUsage = !cond( + !eq(CollectorUsageStr, "discard") : 0, + !eq(CollectorUsageStr, "lastuse") : 1, + !eq(CollectorUsageStr, "fill") : 2, + !eq(CollectorUsageStr, "use") : 3, + ); + + string AOperandStr = !if(!eq(ASpace, "tensor"), "[$a]", "$a"); + NVPTXRegClass ARegClass = !if(!eq(ASpace, "tensor"), B32, B64); + + dag input = !con((ins B32:$dtmem, ARegClass:$a, B64:$b, + B32:$idesc, B1:$enable_inp_d), + SparseMetadataIns, + (ins B32:$scale_a, + B32:$scale_b)); + + let InOperandList = input; + let OutOperandList = (outs); + let AsmString = "tcgen05.mma" + # !if(!eq(Sp, 1), ".sp", "") + # ".cta_group::" # CtaGroup + # ".kind::" # KindStr + # ".block_scale" # ScaleVecSize + # ".collector::a::" # CollectorUsageStr + # " [$dtmem], " # AOperandStr # ", $b" + # SparseMetadataStr + # ", $idesc, [$scale_a], [$scale_b], $enable_inp_d;"; + + dag IntrinsicPattern = !con((Intrin i32:$dtmem, + ARegClass:$a, i64:$b, + i32:$idesc, + i1:$enable_inp_d), + SparseMetadataIntr, + (Intrin i32:$scale_a, + i32:$scale_b)); + + dag FlagOperands = (Intrin (i32 CtaGroup), (i32 CollectorUsage)); + + let Pattern = [!con(IntrinsicPattern, FlagOperands)]; +} + +// tcgen05.mma.block_scale +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + 
foreach kind = ["mxf8f6f4", "mxf4", "mxf4nvf4"] in { + foreach scale_vec_size = ["", ".block16", ".block32"] in { + foreach cta_group = [1, 2] in { + foreach collector_usage = ["fill", "use", "lastuse", "discard"] in { + if NVVM_TCGEN05_MMA_BLOCKSCALE_SUPPORTED.ret then { + def : Tcgen05MMABlockScaleInst, hasPTX<86>)>; + } + } + } + } + } + } +} + +// +// tcgen05.mma.ws Instructions +// + +class Tcgen05MMAWSInst : + NVPTXInst<(outs), (ins), "?", []>, + Requires<[hasTcgen05Instructions, hasPTX<86>]> { + + Intrinsic Intrin = !cast( + NVVM_TCGEN05_MMA_WS.record); + + dag ZeroColMaskIns = !if(!eq(HasZeroColMask, 1), + (ins B64:$zero_col_mask), (ins)); + string ZeroColMaskStr = !if(!eq(HasZeroColMask, 1), ", $zero_col_mask", ""); + dag ZeroColMaskIntr = !if(!eq(HasZeroColMask, 1), + (Intrin i64:$zero_col_mask), (Intrin)); + + dag SparseMetadataIns = !if(!eq(Sp, 1), (ins B32:$spmetadata), (ins)); + dag SparseMetadataIntr = !if(!eq(Sp, 1), (Intrin B32:$spmetadata), (Intrin)); + string SparseMetadataStr = !if(!eq(Sp, 1), ", [$spmetadata]", ""); + + int KindVal = !cond( + !eq(KindStr, "f16") : 0, + !eq(KindStr, "tf32") : 1, + !eq(KindStr, "f8f6f4"): 2, + !eq(KindStr, "i8") : 3, + ); + + int CollectorUsageOp = !cond( + !eq(CollectorUsageOpStr, "discard"): 0, + !eq(CollectorUsageOpStr, "lastuse"): 1, + !eq(CollectorUsageOpStr, "fill") : 2, + !eq(CollectorUsageOpStr, "use") : 3, + ); + + string AOperandStr = !if(!eq(ASpace, "tensor"), "[$a]", "$a"); + NVPTXRegClass ARegClass = !if(!eq(ASpace, "tensor"), B32, B64); + + dag input = !con((ins B32:$dtmem, + ARegClass:$a, B64:$b, + B32:$idesc, + B1:$enable_inp_d), + SparseMetadataIns, + ZeroColMaskIns); + + let InOperandList = input; + let OutOperandList = (outs); + let AsmString = "tcgen05.mma.ws" + # !if(!eq(Sp, 1), ".sp", "") + # ".cta_group::1" + # ".kind::" # KindStr + # ".collector::b" # CollectorBufferB + # "::" # CollectorUsageOpStr + # " [$dtmem], " # AOperandStr # ", $b" + # SparseMetadataStr + # ", $idesc, $enable_inp_d" 
+ # ZeroColMaskStr + # ";"; + + dag IntrinsicPattern = !con((Intrin i32:$dtmem, + ARegClass:$a, i64:$b, + i32:$idesc, + i1:$enable_inp_d), + SparseMetadataIntr, + ZeroColMaskIntr); + + dag FlagOperands = (Intrin (i32 KindVal), (i32 CollectorBufferB), + (i32 CollectorUsageOp)); + + let Pattern = [!con(IntrinsicPattern, FlagOperands)]; +} + +// tcgen05.mma.ws +foreach sp = [0, 1] in { + foreach space = ["shared", "tensor"] in { + foreach kind = ["f16", "tf32", "f8f6f4", "i8"] in { + foreach collector_buffer_b = [0, 1, 2, 3] in { + foreach collector_usage_op = ["discard", "fill", "use", "lastuse"] in { + foreach zero_col_mask = [0, 1] in { + def : Tcgen05MMAWSInst; + } + } + } + } + } +} diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 81af55edccadb..889fa2da81304 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -114,7 +114,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { break; } - return HasTcgen05 && PTXVersion >= 86; + return HasTcgen05; } // f32x2 instructions in Blackwell family bool hasF32x2Instructions() const { diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale-ptx88.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale-ptx88.ll new file mode 100644 index 0000000000000..cfaacc5822e14 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale-ptx88.ll @@ -0,0 +1,526 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} + +define void @tcgen05_mma_mxf8f6f4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 
%ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf8f6f4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf8f6f4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf8f6f4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf8f6f4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf8f6f4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf8f6f4_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf8f6f4_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf8f6f4_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf8f6f4_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) 
%dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void 
@llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_sp_mxf8f6f4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf8f6f4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf8f6f4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf8f6f4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf8f6f4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf8f6f4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf8f6f4_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf8f6f4_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf8f6f4_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf8f6f4_param_8]; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf8f6f4_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; 
CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call 
void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) 
%scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_mxf4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf4_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf4_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf4_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf4_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void 
@llvm.nvvm.tcgen05.mma.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void 
@llvm.nvvm.tcgen05.mma.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_sp_mxf4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf4_param_0]; +; 
CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf4_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf4_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf4_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf4_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf4_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::fill 
[%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) 
%scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) 
%spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_mxf4nvf4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf4nvf4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf4nvf4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf4nvf4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf4nvf4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf4nvf4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf4nvf4_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf4nvf4_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf4nvf4_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, 
[tcgen05_mma_mxf4nvf4_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; 
+; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void 
@llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, 
i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr 
addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_mxf4nvf4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf4nvf4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg 
.b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf4nvf4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf4nvf4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf4nvf4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf4nvf4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf4nvf4_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf4nvf4_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf4nvf4_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf4nvf4_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf4nvf4_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 
%idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void 
@llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) 
%scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 
%enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block16.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block32.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} diff --git 
a/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale.ll new file mode 100644 index 0000000000000..18511989753cb --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale.ll @@ -0,0 +1,291 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} + +define void @tcgen05_mma_mxf8f6f4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf8f6f4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf8f6f4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf8f6f4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf8f6f4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf8f6f4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf8f6f4_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf8f6f4_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf8f6f4_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf8f6f4_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], %rd1, %rd2, %r2, 
[%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 
%enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr 
addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_mxf8f6f4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf8f6f4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf8f6f4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, 
[tcgen05_mma_sp_mxf8f6f4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf8f6f4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf8f6f4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf8f6f4_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf8f6f4_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf8f6f4_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf8f6f4_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf8f6f4_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) 
%spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) 
%spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_mxf4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf4_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf4_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf4_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf4_param_1]; +; CHECK-NEXT: 
tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 
%ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) 
%scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_mxf4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf4_param_0]; +; CHECK-NEXT: 
ld.param.b64 %rd1, [tcgen05_mma_sp_mxf4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf4_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf4_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf4_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf4_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf4_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 
%ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 
%enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-disable-output-lane.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-disable-output-lane.ll new file mode 100644 index 0000000000000..1f7870e21297d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-disable-output-lane.ll @@ -0,0 +1,681 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} + +define void @tcgen05_mma_fp16_shared_disable_output_lane(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_fp16_shared_disable_output_lane( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_fp16_shared_disable_output_lane_param_5]; +; CHECK-NEXT: and.b16 %rs2, 
%rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_fp16_shared_disable_output_lane_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_fp16_shared_disable_output_lane_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_fp16_shared_disable_output_lane_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_fp16_shared_disable_output_lane_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_fp16_shared_disable_output_lane_param_6]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_fp16_shared_disable_output_lane_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r8, %r9, %r10, %r11}, [tcgen05_mma_fp16_shared_disable_output_lane_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r12, %r13, %r14, %r15}, [tcgen05_mma_fp16_shared_disable_output_lane_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: 
tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.ashift.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.ashift.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::use [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::use [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::use [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::use [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> 
%disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr 
addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_fp16_shared_disable_output_lane(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8, ptr addrspace(6) %spmetadata) { +; 
CHECK-LABEL: tcgen05_mma_sp_fp16_shared_disable_output_lane( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_fp16_shared_disable_output_lane_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_fp16_shared_disable_output_lane_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_fp16_shared_disable_output_lane_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_fp16_shared_disable_output_lane_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_fp16_shared_disable_output_lane_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_sp_fp16_shared_disable_output_lane_param_6]; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_sp_fp16_shared_disable_output_lane_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r8, [tcgen05_mma_sp_fp16_shared_disable_output_lane_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [tcgen05_mma_sp_fp16_shared_disable_output_lane_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [tcgen05_mma_sp_fp16_shared_disable_output_lane_param_7]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.ashift.collector::a::discard [%r1], [%r8], %rd2, 
[%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.ashift.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.ashift.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.ashift.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::use [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::use [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::use [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::use [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 1) + + call void 
@llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 
%b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 3) + + ret void +} + +define void @tcgen05_mma_tf32_shared_disable_output_lane(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_tf32_shared_disable_output_lane( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_tf32_shared_disable_output_lane_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_tf32_shared_disable_output_lane_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_tf32_shared_disable_output_lane_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_tf32_shared_disable_output_lane_param_3]; +; CHECK-NEXT: ld.param.b32 
%r2, [tcgen05_mma_tf32_shared_disable_output_lane_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_tf32_shared_disable_output_lane_param_6]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_tf32_shared_disable_output_lane_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r8, %r9, %r10, %r11}, [tcgen05_mma_tf32_shared_disable_output_lane_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r12, %r13, %r14, %r15}, [tcgen05_mma_tf32_shared_disable_output_lane_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.ashift.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, 
%p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.ashift.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::use [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::use [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::use [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::use [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 
%enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, 
i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_tf32_shared_disable_output_lane(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_tf32_shared_disable_output_lane( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_tf32_shared_disable_output_lane_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 
%p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_tf32_shared_disable_output_lane_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_tf32_shared_disable_output_lane_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_tf32_shared_disable_output_lane_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_tf32_shared_disable_output_lane_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_sp_tf32_shared_disable_output_lane_param_6]; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_sp_tf32_shared_disable_output_lane_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r8, [tcgen05_mma_sp_tf32_shared_disable_output_lane_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [tcgen05_mma_sp_tf32_shared_disable_output_lane_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [tcgen05_mma_sp_tf32_shared_disable_output_lane_param_7]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.ashift.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.ashift.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.ashift.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.ashift.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::use [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::use [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::use [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::use [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: ret; + call void 
@llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 
%idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 3) + + call 
void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_f8f6f4_shared_disable_output_lane(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_f8f6f4_shared_disable_output_lane( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_f8f6f4_shared_disable_output_lane_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_f8f6f4_shared_disable_output_lane_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_f8f6f4_shared_disable_output_lane_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_f8f6f4_shared_disable_output_lane_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_f8f6f4_shared_disable_output_lane_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_f8f6f4_shared_disable_output_lane_param_6]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r7, 
[tcgen05_mma_f8f6f4_shared_disable_output_lane_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r8, %r9, %r10, %r11}, [tcgen05_mma_f8f6f4_shared_disable_output_lane_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r12, %r13, %r14, %r15}, [tcgen05_mma_f8f6f4_shared_disable_output_lane_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.ashift.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.ashift.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; 
CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::use [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::use [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 0) + + call 
void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> 
%disable_output_lanev4, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_f8f6f4_shared_disable_output_lane(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_f8f6f4_shared_disable_output_lane( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, 
[tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_param_6]; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r8, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_param_7]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.ashift.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.ashift.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], [%r8], %rd2, 
[%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.ashift.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.ashift.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::use [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::use [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 0) + + call void 
@llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 
%b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 3) + + call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_i8_shared_disable_output_lane(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_i8_shared_disable_output_lane( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_i8_shared_disable_output_lane_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_i8_shared_disable_output_lane_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_i8_shared_disable_output_lane_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_i8_shared_disable_output_lane_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_i8_shared_disable_output_lane_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_i8_shared_disable_output_lane_param_6]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_i8_shared_disable_output_lane_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r8, %r9, %r10, %r11}, [tcgen05_mma_i8_shared_disable_output_lane_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r12, %r13, %r14, %r15}, 
[tcgen05_mma_i8_shared_disable_output_lane_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.ashift.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.ashift.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::fill [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: 
tcgen05.mma.cta_group::2.kind::i8.collector::a::fill [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::use [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::use [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::use [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::use [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 
%ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr 
addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_i8_shared_disable_output_lane(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_i8_shared_disable_output_lane( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_i8_shared_disable_output_lane_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_i8_shared_disable_output_lane_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_i8_shared_disable_output_lane_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_i8_shared_disable_output_lane_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_i8_shared_disable_output_lane_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_sp_i8_shared_disable_output_lane_param_6]; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_sp_i8_shared_disable_output_lane_param_8]; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r8, [tcgen05_mma_sp_i8_shared_disable_output_lane_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [tcgen05_mma_sp_i8_shared_disable_output_lane_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [tcgen05_mma_sp_i8_shared_disable_output_lane_param_7]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.ashift.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.ashift.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.ashift.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.ashift.collector::a::lastuse 
[%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::fill [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::fill [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::fill [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::fill [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::use [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::use [%r1], %rd1, %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::use [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::use [%r1], [%r8], %rd2, [%r7], %r2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + + call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr 
addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 3) 
+ + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-invalid.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-invalid.ll new file mode 100644 index 0000000000000..a9638c2bc75f9 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-invalid.ll @@ -0,0 +1,4 @@ +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_100a -o /dev/null 2>&1 | FileCheck %s +; XFAIL: * +target triple = "nvptx64-nvidia-cuda" + diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll new file mode 100644 index 0000000000000..13a3f7a6a97e5 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll @@ -0,0 +1,412 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} + +define void @tcgen05_mma_fp16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_fp16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_fp16_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_fp16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_fp16_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_fp16_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_fp16_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: 
ld.param.b32 %r3, [tcgen05_mma_fp16_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::use [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::use [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::use [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: 
tcgen05.mma.cta_group::2.kind::f16.collector::a::use [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 1) + + call void 
@llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_sp_fp16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_fp16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_fp16_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 
%p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_fp16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_fp16_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_fp16_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_fp16_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_fp16_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_fp16_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, 
%rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 
%ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 
%ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_tf32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_tf32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_tf32_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_tf32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_tf32_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_tf32_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_tf32_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_tf32_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: 
tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::use [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::use [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::use [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::use [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, 
i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 
0, i32 1, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_sp_tf32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_tf32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_tf32_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_tf32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_tf32_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_tf32_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_tf32_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_tf32_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, 
%p1, 0; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_tf32_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 0) + + call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 3) + + call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_fp16_shared_scale_d_disable_output_lane(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_fp16_shared_scale_d_disable_output_lane( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_param_6]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r8, %r9, %r10, %r11}, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r12, %r13, %r14, %r15}, 
[tcgen05_mma_fp16_shared_scale_d_disable_output_lane_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) 
%dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + + ret void +} + +define void @tcgen05_mma_tf32_shared_scale_d_disable_output_lane(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_tf32_shared_scale_d_disable_output_lane( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_param_6]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r8, %r9, %r10, %r11}, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r12, %r13, %r14, %r15}, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1, 0; +; CHECK-NEXT: 
tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-ws.ll 
b/llvm/test/CodeGen/NVPTX/tcgen05-mma-ws.ll new file mode 100644 index 0000000000000..f12d8fb10aab3 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-ws.ll @@ -0,0 +1,569 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} + +define void @tcgen05_mma_ws_fp16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_ws_fp16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_fp16_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_fp16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_fp16_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_fp16_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_fp16_param_4]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_ws_fp16_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], 
[%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 
%enable_inp_d, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, 
i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 3) + ret void +} + +define void @tcgen05_mma_ws_fp16_zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask) { +; CHECK-LABEL: tcgen05_mma_ws_fp16_zero_col_mask( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_fp16_zero_col_mask_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_fp16_zero_col_mask_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_fp16_zero_col_mask_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_fp16_zero_col_mask_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_fp16_zero_col_mask_param_4]; +; CHECK-NEXT: ld.param.b64 %rd3, [tcgen05_mma_ws_fp16_zero_col_mask_param_6]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_ws_fp16_zero_col_mask_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard 
[%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, 
i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void 
@llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 3) + ret void +} + +define void @tcgen05_mma_ws_sp_fp16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta) { +; CHECK-LABEL: tcgen05_mma_ws_sp_fp16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg 
.b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_sp_fp16_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_sp_fp16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_sp_fp16_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_sp_fp16_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_sp_fp16_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_ws_sp_fp16_param_6]; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_ws_sp_fp16_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, 
[%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, 
i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, 
i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 3) + ret void +} + +define void @tcgen05_mma_ws_sp_fp16_zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, ptr addrspace(6) %spmeta) { +; CHECK-LABEL: tcgen05_mma_ws_sp_fp16_zero_col_mask( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_4]; +; CHECK-NEXT: ld.param.b64 %rd3, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_6]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_7]; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; 
CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], 
[%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) 
%atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr 
addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 3) + ret void +} + +define void @tcgen05_mma_ws_tf32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_ws_tf32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_tf32_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_tf32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_tf32_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_tf32_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_tf32_param_4]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_ws_tf32_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; 
CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr 
addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, 
i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 3) + ret void +} + +define void @tcgen05_mma_ws_f8f6f4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_ws_f8f6f4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_f8f6f4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_f8f6f4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_f8f6f4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_f8f6f4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_f8f6f4_param_4]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_ws_f8f6f4_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 
%idesc, i1 %enable_inp_d, i32 2, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 
%enable_inp_d, i32 2, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 3) + ret void +} + +define void @tcgen05_mma_ws_i8(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_ws_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_i8_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_i8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_i8_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_i8_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_i8_param_4]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_ws_i8_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%r1], 
[%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, 
i32 3, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + 
+ call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma.ll new file mode 100644 index 0000000000000..0a43bd5f91fed --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma.ll @@ -0,0 +1,601 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} + +define void @tcgen05_mma_fp16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_fp16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_fp16_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_fp16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_fp16_param_2]; +; CHECK-NEXT: 
ld.param.b64 %rd2, [tcgen05_mma_fp16_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_fp16_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_fp16_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: 
ret; + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 2) + + call void 
@llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_sp_fp16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_fp16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_fp16_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_fp16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_fp16_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_fp16_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_fp16_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_fp16_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_fp16_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, 
%p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) 
%spmetadata, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 
%enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_tf32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_tf32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_tf32_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_tf32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_tf32_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_tf32_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_tf32_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_tf32_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, 
%p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) 
%dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 
%idesc, i1 %enable_inp_d, i32 1, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_sp_tf32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_tf32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_tf32_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_tf32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_tf32_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_tf32_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_tf32_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_tf32_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_tf32_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 
%enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 
%b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_f8f6f4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_f8f6f4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_f8f6f4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_f8f6f4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_f8f6f4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_f8f6f4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_f8f6f4_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_f8f6f4_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; 
+; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 0) + + call void 
@llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) 
%dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_sp_f8f6f4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_f8f6f4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_f8f6f4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_f8f6f4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_f8f6f4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_f8f6f4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_f8f6f4_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_f8f6f4_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_f8f6f4_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT:
tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr 
addrspace(6) %spmetadata, i32 2, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr 
addrspace(6) %spmetadata, i32 2, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_i8(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_i8_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_i8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_i8_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_i8_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_i8_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_i8_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: 
tcgen05.mma.cta_group::2.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 1) + + call void 
@llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_sp_i8(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: 
tcgen05_mma_sp_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_i8_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_i8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_i8_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_i8_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_i8_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_i8_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_i8_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr 
addrspace(6) %spmetadata, i32 3, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, 
ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 3) + ret void +}