diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ac8ce05724750..ec654e0f3f200 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -648,9 +648,50 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, if (S == NVPTX::Scope::Cluster) T->failIfClustersUnsupported(".cluster scope fence"); + // Fall back to .acq_rel if .acquire, .release is not supported. + if (!T->hasSplitAcquireAndReleaseFences() && + (O == NVPTX::Ordering::Acquire || O == NVPTX::Ordering::Release)) + O = NVPTX::Ordering::AcquireRelease; + switch (O) { case NVPTX::Ordering::Acquire: + switch (S) { + case NVPTX::Scope::System: + return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys + : NVPTX::INT_MEMBAR_SYS; + case NVPTX::Scope::Block: + return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta + : NVPTX::INT_MEMBAR_CTA; + case NVPTX::Scope::Cluster: + return NVPTX::atomic_thread_fence_acquire_cluster; + case NVPTX::Scope::Device: + return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu + : NVPTX::INT_MEMBAR_GL; + case NVPTX::Scope::Thread: + report_fatal_error( + formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", + ScopeToString(S))); + } + break; case NVPTX::Ordering::Release: + switch (S) { + case NVPTX::Scope::System: + return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys + : NVPTX::INT_MEMBAR_SYS; + case NVPTX::Scope::Block: + return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta + : NVPTX::INT_MEMBAR_CTA; + case NVPTX::Scope::Cluster: + return NVPTX::atomic_thread_fence_release_cluster; + case NVPTX::Scope::Device: + return T->hasMemoryOrdering() ? 
NVPTX::atomic_thread_fence_release_gpu + : NVPTX::INT_MEMBAR_GL; + case NVPTX::Scope::Thread: + report_fatal_error( + formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", + ScopeToString(S))); + } + break; case NVPTX::Ordering::AcquireRelease: { switch (S) { case NVPTX::Scope::System: diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 633a99d0fc1be..74423d79e41e0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3866,33 +3866,16 @@ def : Pat < // PTX Fence instructions //////////////////////////////////////////////////////////////////////////////// -def atomic_thread_fence_seq_cst_sys : - NVPTXInst<(outs), (ins), "fence.sc.sys;", []>, - Requires<[hasPTX<60>, hasSM<70>]>; -def atomic_thread_fence_acq_rel_sys : - NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>, - Requires<[hasPTX<60>, hasSM<70>]>; - -def atomic_thread_fence_seq_cst_gpu : - NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>, - Requires<[hasPTX<60>, hasSM<70>]>; -def atomic_thread_fence_acq_rel_gpu : - NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>, - Requires<[hasPTX<60>, hasSM<70>]>; - -def atomic_thread_fence_seq_cst_cluster : - NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>, - Requires<[hasPTX<78>, hasSM<90>]>; -def atomic_thread_fence_acq_rel_cluster : - NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>, - Requires<[hasPTX<78>, hasSM<90>]>; - -def atomic_thread_fence_seq_cst_cta : - NVPTXInst<(outs), (ins), "fence.sc.cta;", []>, - Requires<[hasPTX<60>, hasSM<70>]>; -def atomic_thread_fence_acq_rel_cta : - NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>, - Requires<[hasPTX<60>, hasSM<70>]>; +class NVPTXFenceInst: + NVPTXInst<(outs), (ins), "fence."#sem#"."#scope#";", []>, + Requires<[ptx, hasSM<70>]>; + +foreach scope = ["sys", "gpu", "cluster", "cta"] in { + def atomic_thread_fence_seq_cst_#scope: NVPTXFenceInst>; + def atomic_thread_fence_acq_rel_#scope: 
NVPTXFenceInst>; + def atomic_thread_fence_acquire_#scope: NVPTXFenceInst>; + def atomic_thread_fence_release_#scope: NVPTXFenceInst>; +} def fpimm_any_zero : FPImmLeaf= 70 && PTXVersion >= 60; } + // Does SM & PTX support .acquire and .release qualifiers for fence? + bool hasSplitAcquireAndReleaseFences() const { + return SmVersion >= 90 && PTXVersion >= 86; + } // Does SM & PTX support atomic relaxed MMIO operations ? bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; } bool hasDotInstructions() const { diff --git a/llvm/test/CodeGen/NVPTX/fence-cluster.ll b/llvm/test/CodeGen/NVPTX/fence-cluster.ll new file mode 100644 index 0000000000000..697dce4f89515 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-cluster.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify %} + +define void @fence_acquire_cluster() { +; SM90-LABEL: fence_acquire_cluster( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.acquire.cluster; +; SM90-NEXT: ret; + fence syncscope("cluster") acquire + ret void +} + + +define void @fence_release_cluster() { +; SM90-LABEL: fence_release_cluster( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.release.cluster; +; SM90-NEXT: ret; + fence syncscope("cluster") release + ret void +} + + +define void @fence_acq_rel_cluster() { +; SM90-LABEL: fence_acq_rel_cluster( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.acq_rel.cluster; +; SM90-NEXT: ret; + fence syncscope("cluster") acq_rel + ret void +} + + +define void @fence_seq_cst_cluster() { +; SM90-LABEL: fence_seq_cst_cluster( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.sc.cluster; +; SM90-NEXT: 
ret; + fence syncscope("cluster") seq_cst + ret void +} + diff --git a/llvm/test/CodeGen/NVPTX/fence-nocluster.ll b/llvm/test/CodeGen/NVPTX/fence-nocluster.ll new file mode 100644 index 0000000000000..e2bec72517d55 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-nocluster.ll @@ -0,0 +1,355 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx50 | FileCheck %s --check-prefix=SM30 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx50 | %ptxas-verify %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90 +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify %} + +define void @fence_acquire_sys() { +; SM30-LABEL: fence_acquire_sys( +; SM30: { +; SM30-EMPTY: +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: membar.sys; +; SM30-NEXT: ret; +; +; SM70-LABEL: fence_acquire_sys( +; SM70: { +; SM70-EMPTY: +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ret; +; +; SM90-LABEL: fence_acquire_sys( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: ret; + fence syncscope("") acquire + ret void +} + + +define void @fence_acquire_cta() { +; SM30-LABEL: fence_acquire_cta( +; SM30: { +; SM30-EMPTY: +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: membar.cta; +; SM30-NEXT: ret; +; +; SM70-LABEL: fence_acquire_cta( +; SM70: { +; SM70-EMPTY: +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ret; +; +; SM90-LABEL: fence_acquire_cta( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: ret; + fence syncscope("block") 
acquire + ret void +} + + +define void @fence_acquire_gpu() { +; SM30-LABEL: fence_acquire_gpu( +; SM30: { +; SM30-EMPTY: +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: membar.gl; +; SM30-NEXT: ret; +; +; SM70-LABEL: fence_acquire_gpu( +; SM70: { +; SM70-EMPTY: +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ret; +; +; SM90-LABEL: fence_acquire_gpu( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.acquire.gpu; +; SM90-NEXT: ret; + fence syncscope("device") acquire + ret void +} + + +define void @fence_release_sys() { +; SM30-LABEL: fence_release_sys( +; SM30: { +; SM30-EMPTY: +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: membar.sys; +; SM30-NEXT: ret; +; +; SM70-LABEL: fence_release_sys( +; SM70: { +; SM70-EMPTY: +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ret; +; +; SM90-LABEL: fence_release_sys( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ret; + fence syncscope("") release + ret void +} + + +define void @fence_release_cta() { +; SM30-LABEL: fence_release_cta( +; SM30: { +; SM30-EMPTY: +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: membar.cta; +; SM30-NEXT: ret; +; +; SM70-LABEL: fence_release_cta( +; SM70: { +; SM70-EMPTY: +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ret; +; +; SM90-LABEL: fence_release_cta( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ret; + fence syncscope("block") release + ret void +} + + +define void @fence_release_gpu() { +; SM30-LABEL: fence_release_gpu( +; SM30: { +; SM30-EMPTY: +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: membar.gl; +; SM30-NEXT: ret; +; +; SM70-LABEL: fence_release_gpu( +; SM70: { +; SM70-EMPTY: +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ret; +; +; SM90-LABEL: 
fence_release_gpu( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.release.gpu; +; SM90-NEXT: ret; + fence syncscope("device") release + ret void +} + + +define void @fence_acq_rel_sys() { +; SM30-LABEL: fence_acq_rel_sys( +; SM30: { +; SM30-EMPTY: +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: membar.sys; +; SM30-NEXT: ret; +; +; SM70-LABEL: fence_acq_rel_sys( +; SM70: { +; SM70-EMPTY: +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: ret; +; +; SM90-LABEL: fence_acq_rel_sys( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.acq_rel.sys; +; SM90-NEXT: ret; + fence syncscope("") acq_rel + ret void +} + + +define void @fence_acq_rel_cta() { +; SM30-LABEL: fence_acq_rel_cta( +; SM30: { +; SM30-EMPTY: +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: membar.cta; +; SM30-NEXT: ret; +; +; SM70-LABEL: fence_acq_rel_cta( +; SM70: { +; SM70-EMPTY: +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ret; +; +; SM90-LABEL: fence_acq_rel_cta( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.acq_rel.cta; +; SM90-NEXT: ret; + fence syncscope("block") acq_rel + ret void +} + + +define void @fence_acq_rel_gpu() { +; SM30-LABEL: fence_acq_rel_gpu( +; SM30: { +; SM30-EMPTY: +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: membar.gl; +; SM30-NEXT: ret; +; +; SM70-LABEL: fence_acq_rel_gpu( +; SM70: { +; SM70-EMPTY: +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: fence.acq_rel.gpu; +; SM70-NEXT: ret; +; +; SM90-LABEL: fence_acq_rel_gpu( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.acq_rel.gpu; +; SM90-NEXT: ret; + fence syncscope("device") acq_rel + ret void +} + + +define void @fence_seq_cst_sys() { +; SM30-LABEL: fence_seq_cst_sys( +; SM30: { +; SM30-EMPTY: +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: membar.sys; +; SM30-NEXT: ret; +; +; 
SM70-LABEL: fence_seq_cst_sys( +; SM70: { +; SM70-EMPTY: +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: fence.sc.sys; +; SM70-NEXT: ret; +; +; SM90-LABEL: fence_seq_cst_sys( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.sc.sys; +; SM90-NEXT: ret; + fence syncscope("") seq_cst + ret void +} + + +define void @fence_seq_cst_cta() { +; SM30-LABEL: fence_seq_cst_cta( +; SM30: { +; SM30-EMPTY: +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: membar.cta; +; SM30-NEXT: ret; +; +; SM70-LABEL: fence_seq_cst_cta( +; SM70: { +; SM70-EMPTY: +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ret; +; +; SM90-LABEL: fence_seq_cst_cta( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ret; + fence syncscope("block") seq_cst + ret void +} + + +define void @fence_seq_cst_gpu() { +; SM30-LABEL: fence_seq_cst_gpu( +; SM30: { +; SM30-EMPTY: +; SM30-EMPTY: +; SM30-NEXT: // %bb.0: +; SM30-NEXT: membar.gl; +; SM30-NEXT: ret; +; +; SM70-LABEL: fence_seq_cst_gpu( +; SM70: { +; SM70-EMPTY: +; SM70-EMPTY: +; SM70-NEXT: // %bb.0: +; SM70-NEXT: fence.sc.gpu; +; SM70-NEXT: ret; +; +; SM90-LABEL: fence_seq_cst_gpu( +; SM90: { +; SM90-EMPTY: +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: fence.sc.gpu; +; SM90-NEXT: ret; + fence syncscope("device") seq_cst + ret void +} + diff --git a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll deleted file mode 100644 index dce39bf3e1e3e..0000000000000 --- a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s -; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} - -; CHECK-LABEL: fence_sc_cluster -define void @fence_sc_cluster() local_unnamed_addr { - ; CHECK: fence.sc.cluster - fence syncscope("cluster") seq_cst - ret void -} - -; CHECK-LABEL: 
fence_acq_rel_cluster -define void @fence_acq_rel_cluster() local_unnamed_addr { - ; CHECK: fence.acq_rel.cluster - fence syncscope("cluster") acq_rel - ret void -} - -; CHECK-LABEL: fence_release_cluster -define void @fence_release_cluster() local_unnamed_addr { - ; CHECK: fence.acq_rel.cluster - fence syncscope("cluster") release - ret void -} - -; CHECK-LABEL: fence_acquire_cluster -define void @fence_acquire_cluster() local_unnamed_addr { - ; CHECK: fence.acq_rel.cluster - fence syncscope("cluster") acquire - ret void -} diff --git a/llvm/test/CodeGen/NVPTX/fence.ll b/llvm/test/CodeGen/NVPTX/fence.ll deleted file mode 100644 index e094ddf5775a6..0000000000000 --- a/llvm/test/CodeGen/NVPTX/fence.ll +++ /dev/null @@ -1,102 +0,0 @@ -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=SM60 -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70 -; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %} - -; TODO: implement and test thread scope. 
- -; CHECK-LABEL: fence_sc_sys -define void @fence_sc_sys() local_unnamed_addr { - ; SM60: membar.sys - ; SM70: fence.sc.sys - fence seq_cst - ret void -} - -; CHECK-LABEL: fence_acq_rel_sys -define void @fence_acq_rel_sys() local_unnamed_addr { - ; SM60: membar.sys - ; SM70: fence.acq_rel.sys - fence acq_rel - ret void -} - -; CHECK-LABEL: fence_release_sys -define void @fence_release_sys() local_unnamed_addr { - ; SM60: membar.sys - ; SM70: fence.acq_rel.sys - fence release - ret void -} - -; CHECK-LABEL: fence_acquire_sys -define void @fence_acquire_sys() local_unnamed_addr { - ; SM60: membar.sys - ; SM70: fence.acq_rel.sys - fence acquire - ret void -} - -; CHECK-LABEL: fence_sc_gpu -define void @fence_sc_gpu() local_unnamed_addr { - ; SM60: membar.gl - ; SM70: fence.sc.gpu - fence syncscope("device") seq_cst - ret void -} - -; CHECK-LABEL: fence_acq_rel_gpu -define void @fence_acq_rel_gpu() local_unnamed_addr { - ; SM60: membar.gl - ; SM70: fence.acq_rel.gpu - fence syncscope("device") acq_rel - ret void -} - -; CHECK-LABEL: fence_release_gpu -define void @fence_release_gpu() local_unnamed_addr { - ; SM60: membar.gl - ; SM70: fence.acq_rel.gpu - fence syncscope("device") release - ret void -} - -; CHECK-LABEL: fence_acquire_gpu -define void @fence_acquire_gpu() local_unnamed_addr { - ; SM60: membar.gl - ; SM70: fence.acq_rel.gpu - fence syncscope("device") acquire - ret void -} - -; CHECK-LABEL: fence_sc_cta -define void @fence_sc_cta() local_unnamed_addr { - ; SM60: membar.cta - ; SM70: fence.sc.cta - fence syncscope("block") seq_cst - ret void -} - -; CHECK-LABEL: fence_acq_rel_cta -define void @fence_acq_rel_cta() local_unnamed_addr { - ; SM60: membar.cta - ; SM70: fence.acq_rel.cta - fence syncscope("block") acq_rel - ret void -} - -; CHECK-LABEL: fence_release_cta -define void @fence_release_cta() local_unnamed_addr { - ; SM60: membar.cta - ; SM70: fence.acq_rel.cta - fence syncscope("block") release - ret void -} - -; CHECK-LABEL: fence_acquire_cta 
-define void @fence_acquire_cta() local_unnamed_addr { - ; SM60: membar.cta - ; SM70: fence.acq_rel.cta - fence syncscope("block") acquire - ret void -} \ No newline at end of file diff --git a/llvm/test/CodeGen/NVPTX/fence.py b/llvm/test/CodeGen/NVPTX/fence.py new file mode 100644 index 0000000000000..b9f9d294e6fe8 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence.py @@ -0,0 +1,56 @@ +# For manual usage, not as a part of lit tests. Used for generating the following tests: +# fence-nocluster.ll, fence-cluster.ll + +from string import Template +from itertools import product + +fence_func = Template( + """ +define void @fence_${ordering}_${ptx_scope}() { + fence syncscope(\"${llvm_scope}\") ${ordering} + ret void +} +""" +) + +run_statement = Template( + """; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm} +; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify %}""" +) + +# (sm, ptx) +TESTS = [(30, 50), (70, 60), (90, 87)] + +LLVM_SCOPES_NO_CLUSTER = ["", "block", "device"] + +SCOPE_LLVM_TO_PTX = {"": "sys", "block": "cta", "cluster": "cluster", "device": "gpu"} + +ORDERINGS = ["acquire", "release", "acq_rel", "seq_cst"] + +if __name__ == "__main__": + # non-cluster scopes are supported on SM30, SM70 and SM90 + with open("fence-nocluster.ll", "w") as fp: + for sm, ptx in TESTS: + print(run_statement.substitute(sm=sm, ptx=ptx), file=fp) + for ordering, llvm_scope in product(ORDERINGS, LLVM_SCOPES_NO_CLUSTER): + print( + fence_func.substitute( + llvm_scope=llvm_scope, + ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], + ordering=ordering, + ), + file=fp, + ) + + # cluster scope is only supported on SM90 + with open("fence-cluster.ll", "w") as fp: + print(run_statement.substitute(sm=90, ptx=87), file=fp) + for ordering in ORDERINGS: + print( + fence_func.substitute( + llvm_scope="cluster", + ptx_scope=SCOPE_LLVM_TO_PTX["cluster"], + ordering=ordering, + ), + file=fp, + ) diff
--git a/llvm/test/CodeGen/NVPTX/lit.local.cfg b/llvm/test/CodeGen/NVPTX/lit.local.cfg index e3f06d1a720e3..54a6c338bdf85 100644 --- a/llvm/test/CodeGen/NVPTX/lit.local.cfg +++ b/llvm/test/CodeGen/NVPTX/lit.local.cfg @@ -1,3 +1,4 @@ if not "NVPTX" in config.root.targets: config.unsupported = True config.suffixes.add(".py") +config.excludes = ["fence.py"]