diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index dec6ad4e54115..dcd0a3ac3639b 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1060,6 +1060,81 @@ flavors of the instruction respectively. For more information, refer to the PTX ISA ``_. +'``llvm.nvvm.tcgen05.commit``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.commit.{cg1,cg2}(ptr %mbar) + declare void @llvm.nvvm.tcgen05.commit.shared.{cg1,cg2}(ptr addrspace(3) %mbar) + declare void @llvm.nvvm.tcgen05.commit.mc.{cg1,cg2}(ptr %mbar, i16 %mc) + declare void @llvm.nvvm.tcgen05.commit.mc.shared.{cg1,cg2}(ptr addrspace(3) %mbar, i16 %mc) + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.commit.*``' intrinsics correspond to the +``tcgen05.commit.{cg1/cg2}.mbarrier::arrive::one.*`` set of PTX instructions. +The ``tcgen05.commit`` is an asynchronous instruction which makes the mbarrier +object (``%mbar``) track the completion of all prior asynchronous tcgen05 operations. +The ``.mc`` variants allow signaling on the mbarrier objects of multiple CTAs +(specified by ``%mc``) in the cluster. The ``.cg1`` and ``.cg2`` variants generate +``cta_group::1`` and ``cta_group::2`` flavors of the instruction respectively. + +For more information, refer to the PTX ISA +``_. + +'``llvm.nvvm.tcgen05.wait``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.wait.ld() + declare void @llvm.nvvm.tcgen05.wait.st() + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.wait.ld/st``' intrinsics correspond to +the ``tcgen05.wait::{ld/st}.sync.aligned`` pair of PTX instructions. +The ``tcgen05.wait::ld`` causes the executing thread to block until +all prior ``tcgen05.ld`` operations issued by the executing thread +have completed. The ``tcgen05.wait::st`` causes the executing thread +to block until all prior ``tcgen05.st`` operations issued by the +executing thread have completed. + +For more information, refer to the PTX ISA +``_. + +'``llvm.nvvm.tcgen05.fence``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.fence.before.thread.sync() + declare void @llvm.nvvm.tcgen05.fence.after.thread.sync() + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.fence.*``' intrinsics correspond to +the ``tcgen05.fence::{before/after}_thread_sync`` pair of PTX instructions. +These instructions act as code motion fences for asynchronous tcgen05 +operations. + +For more information, refer to the PTX ISA +``_. + + Other Intrinsics ---------------- diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index abbe25bf0040a..f299a145ac73b 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -5083,6 +5083,38 @@ foreach cta_group = ["cg1", "cg2"] in { def int_nvvm_tcgen05_relinq_alloc_permit_ # cta_group : Intrinsic<[], [], [IntrConvergent, IntrInaccessibleMemOnly]>; + + def int_nvvm_tcgen05_commit_ # cta_group : Intrinsic<[], + [llvm_ptr_ty], // mbar_ptr + [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, + NoCapture>]>; + + def int_nvvm_tcgen05_commit_shared_ # cta_group : Intrinsic<[], + [llvm_shared_ptr_ty], // mbar_ptr + [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, + NoCapture>]>; + + def int_nvvm_tcgen05_commit_mc_ # cta_group : Intrinsic<[], + [llvm_ptr_ty, llvm_i16_ty], // mbar_ptr, cta_mask + [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, + NoCapture>]>; + + def int_nvvm_tcgen05_commit_mc_shared_ # cta_group : Intrinsic<[], + [llvm_shared_ptr_ty, llvm_i16_ty], // mbar_ptr, cta_mask + [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, + NoCapture>]>; } +// Tcgen05 wait_ld/st intrinsics +def int_nvvm_tcgen05_wait_ld : Intrinsic<[], [], + [IntrConvergent, IntrInaccessibleMemOnly]>; +def int_nvvm_tcgen05_wait_st : Intrinsic<[], [], + [IntrConvergent, IntrInaccessibleMemOnly]>; + +// Tcgen05 Fence intrinsics +def int_nvvm_tcgen05_fence_before_thread_sync : Intrinsic<[], [], + [IntrNoMem, IntrHasSideEffects]>; +def int_nvvm_tcgen05_fence_after_thread_sync : Intrinsic<[], [], + [IntrNoMem, IntrHasSideEffects]>; + } // let TargetPrefix = "nvvm" diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index a0d00e4aac560..cdd723cad69c5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -7622,4 +7622,51 @@ multiclass TCGEN05_RELINQ_PERMIT_INTR { defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>; defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>; +def tcgen05_wait_ld: NVPTXInst<(outs), (ins), "tcgen05.wait::ld.sync.aligned;", + [(int_nvvm_tcgen05_wait_ld)]>, + Requires<[hasTcgen05Instructions]>; + +def tcgen05_wait_st: NVPTXInst<(outs), (ins), "tcgen05.wait::st.sync.aligned;", + [(int_nvvm_tcgen05_wait_st)]>, + Requires<[hasTcgen05Instructions]>; + +multiclass TCGEN05_COMMIT_INTR { + defvar prefix = "tcgen05.commit.cta_group::" # num; + defvar suffix = ".mbarrier::arrive::one.shared::cluster"; + + defvar intr_suffix = !if(!eq(AS, "shared"), "_shared", "") # "_cg" # num; + defvar Intr = !cast("int_nvvm_tcgen05_commit" # intr_suffix); + defvar IntrMC = !cast("int_nvvm_tcgen05_commit_mc" # intr_suffix); + + def NAME : NVPTXInst<(outs), (ins rc:$mbar), + !strconcat(prefix, suffix, ".b64 [$mbar];"), + [(Intr rc:$mbar)]>, + Requires<[hasTcgen05Instructions]>; + def NAME # _MC : NVPTXInst<(outs), (ins rc:$mbar, Int16Regs:$mc), + !strconcat(prefix, suffix, ".multicast::cluster.b64 [$mbar], $mc;"), + [(IntrMC rc:$mbar, Int16Regs:$mc)]>, + Requires<[hasTcgen05Instructions]>; +} + +defm TCGEN05_COMMIT_CG1 : TCGEN05_COMMIT_INTR; +defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR; +defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR; +defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR; +defm TCGEN05_COMMIT_S32_CG1 : TCGEN05_COMMIT_INTR; +defm TCGEN05_COMMIT_S32_CG2 : TCGEN05_COMMIT_INTR; + } // isConvergent + +let hasSideEffects = 1 in { + +def tcgen05_fence_before_thread_sync: NVPTXInst<(outs), (ins), + "tcgen05.fence::before_thread_sync;", + [(int_nvvm_tcgen05_fence_before_thread_sync)]>, + Requires<[hasTcgen05Instructions]>; + +def tcgen05_fence_after_thread_sync: NVPTXInst<(outs), (ins), + "tcgen05.fence::after_thread_sync;", + [(int_nvvm_tcgen05_fence_after_thread_sync)]>, + Requires<[hasTcgen05Instructions]>; + +} // hasSideEffects diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll new file mode 100644 index 0000000000000..6e0ec6bcf4465 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} + +declare void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr) +declare void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr) +declare void @llvm.nvvm.tcgen05.commit.shared.cg1(ptr addrspace(3) %bar_addr) +declare void @llvm.nvvm.tcgen05.commit.shared.cg2(ptr addrspace(3) %bar_addr) + +; CHECK-LABEL: test_tcgen05_commit +define void @test_tcgen05_commit(ptr %bar_addr) { +; CHECK_PTX64-LABEL: test_tcgen05_commit( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_param_0]; +; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; +; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_param_0]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; +; CHECK_PTX64_SHARED32-NEXT: ret; + call void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr) + + call void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr) + + ret void +} + +; CHECK-LABEL: test_tcgen05_commit_shared +define void @test_tcgen05_commit_shared(ptr addrspace(3) %bar_addr) { +; CHECK_PTX64-LABEL: test_tcgen05_commit_shared( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_shared_param_0]; +; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; +; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_shared( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_commit_shared_param_0]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%r1]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%r1]; +; CHECK_PTX64_SHARED32-NEXT: ret; + call void @llvm.nvvm.tcgen05.commit.shared.cg1(ptr addrspace(3) %bar_addr) + + call void @llvm.nvvm.tcgen05.commit.shared.cg2(ptr addrspace(3) %bar_addr) + + ret void +} + +declare void @llvm.nvvm.tcgen05.commit.mc.cg1(ptr %bar_addr, i16 %cta_mask) +declare void @llvm.nvvm.tcgen05.commit.mc.cg2(ptr %bar_addr, i16 %cta_mask) +declare void @llvm.nvvm.tcgen05.commit.mc.shared.cg1(ptr addrspace(3) %bar_addr, i16 %cta_mask) +declare void @llvm.nvvm.tcgen05.commit.mc.shared.cg2(ptr addrspace(3) %bar_addr, i16 %cta_mask) + +; CHECK-LABEL: test_tcgen05_commit_mc +define void @test_tcgen05_commit_mc(ptr %bar_addr, i16 %cta_mask) { +; CHECK_PTX64-LABEL: test_tcgen05_commit_mc( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_mc_param_0]; +; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [test_tcgen05_commit_mc_param_1]; +; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; +; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_mc_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.u16 %rs1, [test_tcgen05_commit_mc_param_1]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; +; CHECK_PTX64_SHARED32-NEXT: ret; + call void @llvm.nvvm.tcgen05.commit.mc.cg1(ptr %bar_addr, i16 %cta_mask) + + call void @llvm.nvvm.tcgen05.commit.mc.cg2(ptr %bar_addr, i16 %cta_mask) + + ret void +} + +; CHECK-LABEL: test_tcgen05_commit_mc_shared +define void @test_tcgen05_commit_mc_shared(ptr addrspace(3) %bar_addr, i16 %cta_mask) { +; CHECK_PTX64-LABEL: test_tcgen05_commit_mc_shared( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_mc_shared_param_0]; +; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [test_tcgen05_commit_mc_shared_param_1]; +; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; +; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc_shared( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_commit_mc_shared_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.u16 %rs1, [test_tcgen05_commit_mc_shared_param_1]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1; +; CHECK_PTX64_SHARED32-NEXT: ret; + call void @llvm.nvvm.tcgen05.commit.mc.shared.cg1(ptr addrspace(3) %bar_addr, i16 %cta_mask) + + call void @llvm.nvvm.tcgen05.commit.mc.shared.cg2(ptr addrspace(3) %bar_addr, i16 %cta_mask) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll new file mode 100644 index 0000000000000..07c62671d2fbd --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} + +declare void @llvm.nvvm.tcgen05.fence.before.thread.sync() +declare void @llvm.nvvm.tcgen05.fence.after.thread.sync() +declare void @llvm.nvvm.tcgen05.wait.ld() +declare void @llvm.nvvm.tcgen05.wait.st() + +; CHECK-LABEL: test_tcgen05_fence +define void @test_tcgen05_fence() { +; CHECK-LABEL: test_tcgen05_fence( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: tcgen05.fence::before_thread_sync; +; CHECK-NEXT: tcgen05.fence::after_thread_sync; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.fence.before.thread.sync() + + call void @llvm.nvvm.tcgen05.fence.after.thread.sync() + + ret void +} + +; CHECK-LABEL: test_tcgen05_wait +define void @test_tcgen05_wait() { +; CHECK-LABEL: test_tcgen05_wait( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: tcgen05.wait::ld.sync.aligned; +; CHECK-NEXT: tcgen05.wait::st.sync.aligned; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.wait.ld() + + call void @llvm.nvvm.tcgen05.wait.st() + + ret void +}