From 9a0f380f3d4733ce3424d99f314789d30efe9dee Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Tue, 11 Feb 2025 17:14:35 +0530 Subject: [PATCH] [NVPTX] Add tcgen05.cp/shift intrinsics This patch adds intrinsics for tcgen05.cp and tcgen05.shift instructions. lit tests are added and verified with a ptxas-12.8 executable. Docs are updated in the NVPTXUsage.rst file. Signed-off-by: Durgadoss R --- llvm/docs/NVPTXUsage.rst | 87 ++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 32 +++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 42 +++ llvm/test/CodeGen/NVPTX/tcgen05-cp.ll | 348 +++++++++++++++++++++++ llvm/test/CodeGen/NVPTX/tcgen05-shift.ll | 23 ++ 5 files changed, 532 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-cp.ll create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-shift.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 8550af456e961..675b458c41e7b 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1183,6 +1183,93 @@ operations. For more information, refer to the PTX ISA ``_. +'``llvm.nvvm.tcgen05.shift``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) + declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.shift.{cg1/cg2}``' intrinsics correspond to +the ``tcgen05.shift.{cg1/cg2}`` PTX instructions. The ``tcgen05.shift`` +is an asynchronous instruction which initiates the shifting of 32-byte +elements downwards across all the rows, except the last, by one row. +The address operand ``%tmem_addr`` specifies the base address of the +matrix in the Tensor Memory whose rows must be down shifted. + +For more information, refer to the PTX ISA +``_. + +'``llvm.nvvm.tcgen05.cp``' +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.cp.4x256b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x256b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x128b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + + declare void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + + declare void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.cp.{shape}.{src_fmt}.{cg1/cg2}``' intrinsics +correspond to the ``tcgen05.cp.*`` family of PTX instructions. +The ``tcgen05.cp`` instruction initiates an asynchronous copy operation from +shared memory to the location specified by ``%tmem_addr`` in Tensor Memory. +The 64-bit register operand ``%sdesc`` is the matrix descriptor representing +the source matrix in shared memory that needs to be copied. + +The valid shapes for the copy operation are: +{128x256b, 4x256b, 128x128b, 64x128b_warpx2_02_13, 64x128b_warpx2_01_23, 32x128b_warpx4}. + +Shapes ``64x128b`` and ``32x128b`` require dedicated multicast qualifiers, +which are appended to the corresponding intrinsic names. + +Optionally, the data can be decompressed from the source format in the shared memory +to the destination format in Tensor Memory during the copy operation. Currently, +only ``.b8x16`` is supported as destination format. The valid source formats are +``.b6x16_p32`` and ``.b4x16_p64``. + +When the source format is ``.b6x16_p32``, a contiguous set of 16 elements of 6-bits +each followed by four bytes of padding (``_p32``) in shared memory is decompressed +into 16 elements of 8-bits (``.b8x16``) each in the Tensor Memory. + +When the source format is ``.b4x16_p64``, a contiguous set of 16 elements of 4-bits +each followed by eight bytes of padding (``_p64``) in shared memory is decompressed +into 16 elements of 8-bits (``.b8x16``) each in the Tensor Memory. + +For more information on the decompression schemes, refer to the PTX ISA +``_. + +For more information on the tcgen05.cp instruction, refer to the PTX ISA +``_. Other Intrinsics ---------------- diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7ef270f3256a6..c32bf0318b5d6 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -55,6 +55,14 @@ def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr // MISC // +// Helper class that concatenates list elements with +// a given separator 'sep' and returns the result. +// Handles empty strings. +class StrJoin str_list> { + string ret = !foldl("", str_list, a, b, + !if(!eq(a, ""), b, !if(!eq(b, ""), a, !strconcat(a, sep, b)))); +} + // Helper class that represents a 'fragment' of an NVPTX *MMA instruction. // Geom: mnk. E.g. m8n32k16 // Frag: [a|b|c|d] ([x1|x2|x4] for ldmatrix) @@ -5140,6 +5148,11 @@ foreach cta_group = ["cg1", "cg2"] in { [llvm_shared_ptr_ty, llvm_i16_ty], // mbar_ptr, cta_mask [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, NoCapture>]>; + + def int_nvvm_tcgen05_shift_down_ # cta_group : Intrinsic<[], + [llvm_tmem_ptr_ty], // tmem_addr + [IntrConvergent, IntrArgMemOnly, + NoCapture>]>; } // Tcgen05 wait_ld/st intrinsics @@ -5154,4 +5167,23 @@ def int_nvvm_tcgen05_fence_before_thread_sync : Intrinsic<[], [], def int_nvvm_tcgen05_fence_after_thread_sync : Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; +// Tcgen05 cp intrinsics +foreach cta_group = ["cg1", "cg2"] in { + foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { + foreach shape = ["128x256b", "4x256b", "128x128b", + "64x128b_warpx2_02_13", + "64x128b_warpx2_01_23", + "32x128b_warpx4"] in { + defvar intr_suffix = StrJoin<"_", [shape, src_fmt, cta_group]>.ret; + defvar name_suffix = StrJoin<".", [shape, src_fmt, cta_group]>.ret; + + def int_nvvm_tcgen05_cp_ # intr_suffix : Intrinsic<[], + [llvm_tmem_ptr_ty, // tmem_addr + llvm_i64_ty], // smem descriptor + [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, NoCapture>], + "llvm.nvvm.tcgen05.cp." # name_suffix>; + } + } +} + } // let TargetPrefix = "nvvm" diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index f20502521829e..ed7963f35a7c7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -7704,6 +7704,48 @@ defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR; defm TCGEN05_COMMIT_S32_CG1 : TCGEN05_COMMIT_INTR; defm TCGEN05_COMMIT_S32_CG2 : TCGEN05_COMMIT_INTR; +multiclass TCGEN05_SHIFT_INTR { + def NAME : NVPTXInst<(outs), + (ins Int32Regs:$tmem_addr), + !strconcat("tcgen05.shift.cta_group::", num, ".down [$tmem_addr];"), + [(Intr Int32Regs:$tmem_addr)]>, + Requires<[hasTcgen05Instructions]>; +} +defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>; +defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>; + +multiclass TCGEN05_CP_INTR { + defvar dst_fmt = !if(!eq(src_fmt, ""), "", ".b8x16"); + defvar fmt_asm = StrJoin<".", [dst_fmt, src_fmt]>.ret; + defvar fmt_intr = StrJoin<"_", [src_fmt]>.ret; + + defvar shape_mc_asm = StrJoin<".", [shape, mc]>.ret; + defvar shape_mc_intr = !subst("::", "_", !subst(".", "_", shape_mc_asm)); + + defvar intr_prefix = StrJoin<"_", ["int_nvvm_tcgen05_cp", shape_mc_intr, fmt_intr]>.ret; + defvar IntrCG1 = !cast(intr_prefix # "_cg1"); + defvar IntrCG2 = !cast(intr_prefix # "_cg2"); + + def NAME # _cg1 : NVPTXInst<(outs), + (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc), + "tcgen05.cp.cta_group::1." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;", + [(IntrCG1 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>, + Requires<[hasTcgen05Instructions]>; + def NAME # _cg2 : NVPTXInst<(outs), + (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc), + "tcgen05.cp.cta_group::2." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;", + [(IntrCG2 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>, + Requires<[hasTcgen05Instructions]>; +} + +foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { + defm TCGEN05_CP_128x256b # src_fmt : TCGEN05_CP_INTR<"128x256b", src_fmt>; + defm TCGEN05_CP_4x256b # src_fmt : TCGEN05_CP_INTR<"4x256b", src_fmt>; + defm TCGEN05_CP_128x128b # src_fmt : TCGEN05_CP_INTR<"128x128b", src_fmt>; + defm TCGEN05_CP_64x128_1 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::02_13">; + defm TCGEN05_CP_64x128_2 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::01_23">; + defm TCGEN05_CP_32x128 # src_fmt : TCGEN05_CP_INTR<"32x128b", src_fmt, "warpx4">; +} } // isConvergent let hasSideEffects = 1 in { diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll new file mode 100644 index 0000000000000..50dc93325c286 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll @@ -0,0 +1,348 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v1 +define void @test_tcgen05_cp_64x128_v1(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v2 +define void @test_tcgen05_cp_64x128_v2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_32x128 +define void @test_tcgen05_cp_32x128(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + + +; CHECK-LABEL: test_tcgen05_cp_128x128b +define void @test_tcgen05_cp_128x128b(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x128b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x128b.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_128x256b +define void @test_tcgen05_cp_128x256b(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_4x256b +define void @test_tcgen05_cp_4x256b(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.4x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.4x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; With src_fmt as b6x16_p32 +; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32 +define void @test_tcgen05_cp_128x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32 +define void @test_tcgen05_cp_4x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32 +define void @test_tcgen05_cp_128x128b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32 +define void @test_tcgen05_cp_64x128_v1_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32 +define void @test_tcgen05_cp_64x128_v2_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32 +define void @test_tcgen05_cp_32x128_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; With src_fmt as b4x16_p64 +; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64 +define void @test_tcgen05_cp_128x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64 +define void @test_tcgen05_cp_4x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64 +define void @test_tcgen05_cp_128x128b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64 +define void @test_tcgen05_cp_64x128_v1_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64 +define void @test_tcgen05_cp_64x128_v2_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64 +define void @test_tcgen05_cp_32x128_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll new file mode 100644 index 0000000000000..13a45b9d86dcf --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} + +declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) +declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) + +; CHECK-LABEL: test_tcgen05_shift +define void @test_tcgen05_shift(ptr addrspace(6) %tmem_addr) { +; CHECK-LABEL: test_tcgen05_shift( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_shift_param_0]; +; CHECK-NEXT: tcgen05.shift.cta_group::1.down [%r1]; +; CHECK-NEXT: tcgen05.shift.cta_group::2.down [%r1]; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) + call void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) + + ret void +}