diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 5931a77a85fec..08c8460045c6a 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -71,10 +71,11 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple, if (TargetPointerWidth == 32) resetDataLayout( - "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); + "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); else if (Opts.NVPTXUseShortPointers) - resetDataLayout("e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:" - "16-v32:32-n16:32:64"); + resetDataLayout( + "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:" + "16-v32:32-n16:32:64"); else resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index fe29aadb1dd53..9cb00e8ee73d3 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -160,7 +160,7 @@ // RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=NVPTX -// NVPTX: target datalayout = "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" +// NVPTX: target datalayout = "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" // RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=NVPTX64 diff --git a/llvm/include/llvm/Support/NVPTXAddrSpace.h b/llvm/include/llvm/Support/NVPTXAddrSpace.h index 486a396621da1..04f74c34787cc 100644 --- a/llvm/include/llvm/Support/NVPTXAddrSpace.h +++ b/llvm/include/llvm/Support/NVPTXAddrSpace.h @@ -25,6 +25,7 @@ enum AddressSpace : unsigned { ADDRESS_SPACE_CONST = 4, ADDRESS_SPACE_LOCAL = 5, ADDRESS_SPACE_TENSOR = 6, + ADDRESS_SPACE_SHARED_CLUSTER = 7, ADDRESS_SPACE_PARAM = 101, }; diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 
4e2e4c99df803..0b137250e4e59 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -285,6 +285,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, case NVPTX::AddressSpace::Global: case NVPTX::AddressSpace::Const: case NVPTX::AddressSpace::Shared: + case NVPTX::AddressSpace::SharedCluster: case NVPTX::AddressSpace::Param: case NVPTX::AddressSpace::Local: O << "." << A; diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 98e77ca80b8d5..cf21ad991ccdf 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -176,6 +176,7 @@ enum AddressSpace : AddressSpaceUnderlyingType { Shared = 3, Const = 4, Local = 5, + SharedCluster = 7, // NVPTX Backend Private: Param = 101 diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp index b910ccab21bf3..a579783802aa2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp @@ -86,6 +86,12 @@ static AliasResult::Kind getAliasResult(unsigned AS1, unsigned AS2) { // TODO: cvta.param is not yet supported. We need to change aliasing // rules once it is added. + // Distributed shared memory aliases with shared memory. + if (((AS1 == ADDRESS_SPACE_SHARED) && + (AS2 == ADDRESS_SPACE_SHARED_CLUSTER)) || + ((AS1 == ADDRESS_SPACE_SHARED_CLUSTER) && (AS2 == ADDRESS_SPACE_SHARED))) + return AliasResult::MayAlias; + return (AS1 == AS2 ? 
AliasResult::MayAlias : AliasResult::NoAlias); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 486c7c815435a..032975ed663e9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -513,6 +513,8 @@ static std::optional convertAS(unsigned AS) { return NVPTX::AddressSpace::Global; case llvm::ADDRESS_SPACE_SHARED: return NVPTX::AddressSpace::Shared; + case llvm::ADDRESS_SPACE_SHARED_CLUSTER: + return NVPTX::AddressSpace::SharedCluster; case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::AddressSpace::Generic; case llvm::ADDRESS_SPACE_PARAM: @@ -658,7 +660,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { bool AddrGenericOrGlobalOrShared = (CodeAddrSpace == NVPTX::AddressSpace::Generic || CodeAddrSpace == NVPTX::AddressSpace::Global || - CodeAddrSpace == NVPTX::AddressSpace::Shared); + CodeAddrSpace == NVPTX::AddressSpace::Shared || + CodeAddrSpace == NVPTX::AddressSpace::SharedCluster); if (!AddrGenericOrGlobalOrShared) return NVPTX::Ordering::NotAtomic; @@ -979,6 +982,12 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { case ADDRESS_SPACE_SHARED: Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared; break; + case ADDRESS_SPACE_SHARED_CLUSTER: + if (!TM.is64Bit()) + report_fatal_error( + "Shared cluster address space is only supported in 64-bit mode"); + Opc = NVPTX::cvta_shared_cluster_64; + break; case ADDRESS_SPACE_CONST: Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const; break; @@ -1004,6 +1013,12 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { case ADDRESS_SPACE_SHARED: Opc = TM.is64Bit() ? 
NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared; break; + case ADDRESS_SPACE_SHARED_CLUSTER: + if (!TM.is64Bit()) + report_fatal_error( + "Shared cluster address space is only supported in 64-bit mode"); + Opc = NVPTX::cvta_to_shared_cluster_64; + break; case ADDRESS_SPACE_CONST: Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const; break; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 49f4f30096f00..18baf1f338023 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3043,8 +3043,27 @@ SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op, unsigned SrcAS = N->getSrcAddressSpace(); unsigned DestAS = N->getDestAddressSpace(); if (SrcAS != llvm::ADDRESS_SPACE_GENERIC && - DestAS != llvm::ADDRESS_SPACE_GENERIC) + DestAS != llvm::ADDRESS_SPACE_GENERIC) { + // Shared and SharedCluster can be converted to each other through generic + // space + if ((SrcAS == llvm::ADDRESS_SPACE_SHARED && + DestAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER) || + (SrcAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER && + DestAS == llvm::ADDRESS_SPACE_SHARED)) { + SDLoc DL(Op.getNode()); + const MVT GenerictVT = + getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_GENERIC); + SDValue GenericConversion = DAG.getAddrSpaceCast( + DL, GenerictVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC); + SDValue SharedClusterConversion = + DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion, + ADDRESS_SPACE_GENERIC, DestAS); + return SharedClusterConversion; + } + return DAG.getUNDEF(Op.getValueType()); + } + return Op; } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index ee6380a8a89c4..043da14bcb236 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -137,6 +137,7 @@ def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; def hasAtomMinMax64 : 
Predicate<"Subtarget->hasAtomMinMax64()">; def hasVote : Predicate<"Subtarget->hasVote()">; def hasDouble : Predicate<"Subtarget->hasDouble()">; +def hasClusters : Predicate<"Subtarget->hasClusters()">; def hasLDG : Predicate<"Subtarget->hasLDG()">; def hasLDU : Predicate<"Subtarget->hasLDU()">; def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 4ba3e6f06bb5f..a6595e512dbae 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -33,6 +33,9 @@ def AS_match { code shared = [{ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED); }]; + code shared_cluster = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED_CLUSTER); + }]; code global = [{ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL); }]; @@ -2039,10 +2042,11 @@ class ATOMIC_GLOBAL_CHK : PatFrag; class ATOMIC_SHARED_CHK : PatFrag; +class ATOMIC_SHARED_CLUSTER_CHK + : PatFrag; class ATOMIC_GENERIC_CHK : PatFrag; - multiclass F_ATOMIC_2 preds> { defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b;"; @@ -2094,6 +2098,7 @@ multiclass F_ATOMIC_2_AS, preds>; defm _S : F_ATOMIC_2, preds>; + defm _S_C : F_ATOMIC_2, !listconcat([hasClusters], preds)>; defm _GEN : F_ATOMIC_2, preds>; } @@ -2101,6 +2106,7 @@ multiclass F_ATOMIC_3_AS, preds>; defm _S : F_ATOMIC_3, preds>; + defm _S_C : F_ATOMIC_3, !listconcat([hasClusters], preds)>; defm _GEN : F_ATOMIC_3, preds>; } @@ -2381,18 +2387,22 @@ def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>; def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>; -multiclass NG_TO_G { - def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - "cvta." # Str # ".u32 \t$result, $src;", []>; - def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - "cvta." 
# Str # ".u64 \t$result, $src;", []>; +multiclass NG_TO_G Preds = []> { + if Supports32 then + def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + "cvta." # Str # ".u32 \t$result, $src;", []>, Requires; + + def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + "cvta." # Str # ".u64 \t$result, $src;", []>, Requires; } -multiclass G_TO_NG { - def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - "cvta.to." # Str # ".u32 \t$result, $src;", []>; - def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - "cvta.to." # Str # ".u64 \t$result, $src;", []>; +multiclass G_TO_NG Preds = []> { + if Supports32 then + def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), + "cvta.to." # Str # ".u32 \t$result, $src;", []>, Requires; + + def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), + "cvta.to." # Str # ".u64 \t$result, $src;", []>, Requires; } foreach space = ["local", "shared", "global", "const", "param"] in { @@ -2400,6 +2410,9 @@ foreach space = ["local", "shared", "global", "const", "param"] in { defm cvta_to_#space : G_TO_NG; } +defm cvta_shared_cluster : NG_TO_G<"shared::cluster", false, [hasClusters]>; +defm cvta_to_shared_cluster : G_TO_NG<"shared::cluster", false, [hasClusters]>; + def : Pat<(int_nvvm_ptr_param_to_gen i32:$src), (cvta_param $src)>; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index a4c3b43aec9f2..1a7b20390a562 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -117,13 +117,15 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { std::string Ret = "e"; - if (!is64Bit) - Ret += "-p:32:32"; - else if (UseShortPointers) - Ret += "-p3:32:32-p4:32:32-p5:32:32"; - // Tensor Memory (addrspace:6) is always 32-bits. 
- Ret += "-p6:32:32"; + // Distributed Shared Memory (addrspace:7) follows shared memory + // (addrspace:3). + if (!is64Bit) + Ret += "-p:32:32-p6:32:32-p7:32:32"; + else if (UseShortPointers) { + Ret += "-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32"; + } else + Ret += "-p6:32:32"; Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; @@ -280,8 +282,10 @@ NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const { case Intrinsic::nvvm_isspacep_local: return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL); case Intrinsic::nvvm_isspacep_shared: - case Intrinsic::nvvm_isspacep_shared_cluster: return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED); + case Intrinsic::nvvm_isspacep_shared_cluster: + return std::make_pair(II->getArgOperand(0), + llvm::ADDRESS_SPACE_SHARED_CLUSTER); default: break; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index b5f7d90cd29d9..e0c946290adf2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -424,12 +424,13 @@ static std::optional evaluateIsSpace(Intrinsic::ID IID, unsigned AS) { case Intrinsic::nvvm_isspacep_local: return AS == NVPTXAS::ADDRESS_SPACE_LOCAL; case Intrinsic::nvvm_isspacep_shared: + // If shared cluster this can't be evaluated at compile time. + if (AS == NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER) + return std::nullopt; return AS == NVPTXAS::ADDRESS_SPACE_SHARED; case Intrinsic::nvvm_isspacep_shared_cluster: - // We can't tell shared from shared_cluster at compile time from AS alone, - // but it can't be either is AS is not shared. - return AS == NVPTXAS::ADDRESS_SPACE_SHARED ? 
std::nullopt - : std::optional{false}; + return AS == NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER || + AS == NVPTXAS::ADDRESS_SPACE_SHARED; case Intrinsic::nvvm_isspacep_const: return AS == NVPTXAS::ADDRESS_SPACE_CONST; default: diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 70bf02035fd48..a1b4a0e5e7471 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -168,6 +168,8 @@ inline std::string AddressSpaceToString(AddressSpace A) { return "const"; case AddressSpace::Shared: return "shared"; + case AddressSpace::SharedCluster: + return "shared::cluster"; case AddressSpace::Param: return "param"; case AddressSpace::Local: diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll b/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll new file mode 100644 index 0000000000000..afd0a7fded64e --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s -check-prefixes=NOPTRCONV +; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 --nvptx-short-ptr | FileCheck %s -check-prefixes=PTRCONV +; RUN: %if ptxas-12.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify %} +; RUN: %if ptxas-12.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 --nvptx-short-ptr | %ptxas-verify %} + +; ALL-LABEL: conv_shared_cluster_to_generic +define i32 @conv_shared_cluster_to_generic(ptr addrspace(7) %ptr) { +; CLS32: cvta.shared::cluster.u32 +; NOPTRCONV-LABEL: conv_shared_cluster_to_generic( +; NOPTRCONV: { +; NOPTRCONV-NEXT: .reg .b32 %r<2>; +; NOPTRCONV-NEXT: .reg .b64 %rd<3>; +; NOPTRCONV-EMPTY: +; NOPTRCONV-NEXT: // %bb.0: +; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_shared_cluster_to_generic_param_0]; +; NOPTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1; +; 
NOPTRCONV-NEXT: ld.u32 %r1, [%rd2]; +; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1; +; NOPTRCONV-NEXT: ret; +; +; PTRCONV-LABEL: conv_shared_cluster_to_generic( +; PTRCONV: { +; PTRCONV-NEXT: .reg .b32 %r<3>; +; PTRCONV-NEXT: .reg .b64 %rd<3>; +; PTRCONV-EMPTY: +; PTRCONV-NEXT: // %bb.0: +; PTRCONV-NEXT: ld.param.u32 %r1, [conv_shared_cluster_to_generic_param_0]; +; PTRCONV-NEXT: cvt.u64.u32 %rd1, %r1; +; PTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1; +; PTRCONV-NEXT: ld.u32 %r2, [%rd2]; +; PTRCONV-NEXT: st.param.b32 [func_retval0], %r2; +; PTRCONV-NEXT: ret; + %genptr = addrspacecast ptr addrspace(7) %ptr to ptr + %val = load i32, ptr %genptr + ret i32 %val +} + +; ALL-LABEL: conv_generic_to_shared_cluster +define i32 @conv_generic_to_shared_cluster(ptr %ptr) { +; CLS32: cvta.to.shared::cluster.u32 +; NOPTRCONV-LABEL: conv_generic_to_shared_cluster( +; NOPTRCONV: { +; NOPTRCONV-NEXT: .reg .b32 %r<2>; +; NOPTRCONV-NEXT: .reg .b64 %rd<3>; +; NOPTRCONV-EMPTY: +; NOPTRCONV-NEXT: // %bb.0: +; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_generic_to_shared_cluster_param_0]; +; NOPTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd2, %rd1; +; NOPTRCONV-NEXT: ld.shared::cluster.u32 %r1, [%rd2]; +; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1; +; NOPTRCONV-NEXT: ret; +; +; PTRCONV-LABEL: conv_generic_to_shared_cluster( +; PTRCONV: { +; PTRCONV-NEXT: .reg .b32 %r<3>; +; PTRCONV-NEXT: .reg .b64 %rd<3>; +; PTRCONV-EMPTY: +; PTRCONV-NEXT: // %bb.0: +; PTRCONV-NEXT: ld.param.u64 %rd1, [conv_generic_to_shared_cluster_param_0]; +; PTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd2, %rd1; +; PTRCONV-NEXT: cvt.u32.u64 %r1, %rd2; +; PTRCONV-NEXT: ld.shared::cluster.u32 %r2, [%r1]; +; PTRCONV-NEXT: st.param.b32 [func_retval0], %r2; +; PTRCONV-NEXT: ret; + %specptr = addrspacecast ptr %ptr to ptr addrspace(7) + %val = load i32, ptr addrspace(7) %specptr + ret i32 %val +} + +; ALL-LABEL: conv_shared_to_shared_cluster +define i32 @conv_shared_to_shared_cluster(ptr addrspace(3) %ptr) { +; 
NOPTRCONV-LABEL: conv_shared_to_shared_cluster( +; NOPTRCONV: { +; NOPTRCONV-NEXT: .reg .b32 %r<2>; +; NOPTRCONV-NEXT: .reg .b64 %rd<4>; +; NOPTRCONV-EMPTY: +; NOPTRCONV-NEXT: // %bb.0: +; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_shared_to_shared_cluster_param_0]; +; NOPTRCONV-NEXT: cvta.shared.u64 %rd2, %rd1; +; NOPTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd3, %rd2; +; NOPTRCONV-NEXT: ld.shared::cluster.u32 %r1, [%rd3]; +; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1; +; NOPTRCONV-NEXT: ret; +; +; PTRCONV-LABEL: conv_shared_to_shared_cluster( +; PTRCONV: { +; PTRCONV-NEXT: .reg .b32 %r<4>; +; PTRCONV-NEXT: .reg .b64 %rd<4>; +; PTRCONV-EMPTY: +; PTRCONV-NEXT: // %bb.0: +; PTRCONV-NEXT: ld.param.u32 %r1, [conv_shared_to_shared_cluster_param_0]; +; PTRCONV-NEXT: cvt.u64.u32 %rd1, %r1; +; PTRCONV-NEXT: cvta.shared.u64 %rd2, %rd1; +; PTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd3, %rd2; +; PTRCONV-NEXT: cvt.u32.u64 %r2, %rd3; +; PTRCONV-NEXT: ld.shared::cluster.u32 %r3, [%r2]; +; PTRCONV-NEXT: st.param.b32 [func_retval0], %r3; +; PTRCONV-NEXT: ret; + %specptr = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(7) + %val = load i32, ptr addrspace(7) %specptr + ret i32 %val +} + +; ALL-LABEL: conv_shared_cluster_to_shared +define i32 @conv_shared_cluster_to_shared(ptr addrspace(7) %ptr) { +; NOPTRCONV-LABEL: conv_shared_cluster_to_shared( +; NOPTRCONV: { +; NOPTRCONV-NEXT: .reg .b32 %r<2>; +; NOPTRCONV-NEXT: .reg .b64 %rd<4>; +; NOPTRCONV-EMPTY: +; NOPTRCONV-NEXT: // %bb.0: +; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_shared_cluster_to_shared_param_0]; +; NOPTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1; +; NOPTRCONV-NEXT: cvta.to.shared.u64 %rd3, %rd2; +; NOPTRCONV-NEXT: ld.shared.u32 %r1, [%rd3]; +; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1; +; NOPTRCONV-NEXT: ret; +; +; PTRCONV-LABEL: conv_shared_cluster_to_shared( +; PTRCONV: { +; PTRCONV-NEXT: .reg .b32 %r<4>; +; PTRCONV-NEXT: .reg .b64 %rd<4>; +; PTRCONV-EMPTY: +; PTRCONV-NEXT: // %bb.0: +; 
PTRCONV-NEXT: ld.param.u32 %r1, [conv_shared_cluster_to_shared_param_0]; +; PTRCONV-NEXT: cvt.u64.u32 %rd1, %r1; +; PTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1; +; PTRCONV-NEXT: cvta.to.shared.u64 %rd3, %rd2; +; PTRCONV-NEXT: cvt.u32.u64 %r2, %rd3; +; PTRCONV-NEXT: ld.shared.u32 %r3, [%r2]; +; PTRCONV-NEXT: st.param.b32 [func_retval0], %r3; +; PTRCONV-NEXT: ret; + %specptr = addrspacecast ptr addrspace(7) %ptr to ptr addrspace(3) + %val = load i32, ptr addrspace(3) %specptr + ret i32 %val +} diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll new file mode 100644 index 0000000000000..8b6c554aeb9f2 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll @@ -0,0 +1,271 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s +; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %} + +target triple = "nvptx64-nvidia-cuda" + +; Floating point atomic operations tests +define void @test_distributed_shared_cluster_float_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr { +; CHECK-LABEL: test_distributed_shared_cluster_float_atomic( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .f64 %fd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_float_atomic_param_0]; +; CHECK-NEXT: mov.b16 %rs1, 0x3C00; +; CHECK-NEXT: atom.shared::cluster.add.noftz.f16 %rs2, [%rd1], %rs1; +; CHECK-NEXT: mov.b16 %rs3, 0x3F80; +; CHECK-NEXT: atom.shared::cluster.add.noftz.bf16 %rs4, [%rd1], %rs3; +; CHECK-NEXT: atom.shared::cluster.add.f32 %f1, [%rd1], 0f3F800000; +; CHECK-NEXT: atom.shared::cluster.add.f64 %fd1, [%rd1], 0d3FF0000000000000; +; CHECK-NEXT: ret; +entry: + ; Floating point atomic operations + 
%0 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, half 1.000000e+00 seq_cst + %1 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, bfloat 1.000000e+00 seq_cst + %2 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, float 1.000000e+00 seq_cst + %3 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, double 1.000000e+00 seq_cst + + ret void +} + +; Integer atomic operations tests +define void @test_distributed_shared_cluster_int_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr { +; CHECK-LABEL: test_distributed_shared_cluster_int_atomic( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_int_atomic_param_0]; +; CHECK-NEXT: atom.shared::cluster.add.u32 %r1, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.add.u64 %rd2, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.exch.b32 %r2, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.exch.b64 %rd3, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.min.s32 %r3, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.min.s64 %rd4, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.min.u32 %r4, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.min.u64 %rd5, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.max.s32 %r5, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.max.s64 %rd6, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.max.u32 %r6, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.max.u64 %rd7, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.inc.u32 %r7, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.dec.u32 %r8, [%rd1], 1; +; CHECK-NEXT: ret; +entry: + ; Integer add operations + %0 = atomicrmw add ptr addrspace(7) %dsmem_ptr, i32 1 monotonic + %1 = atomicrmw add ptr addrspace(7) %dsmem_ptr, i64 1 monotonic + + ; Exchange operations + %2 = atomicrmw xchg ptr addrspace(7) %dsmem_ptr, i32 1 monotonic + %3 = atomicrmw xchg ptr addrspace(7) %dsmem_ptr, i64 1 monotonic + + ; Min operations (signed and unsigned) + %4 = 
atomicrmw min ptr addrspace(7) %dsmem_ptr, i32 1 monotonic + %5 = atomicrmw min ptr addrspace(7) %dsmem_ptr, i64 1 monotonic + %6 = atomicrmw umin ptr addrspace(7) %dsmem_ptr, i32 1 monotonic + %7 = atomicrmw umin ptr addrspace(7) %dsmem_ptr, i64 1 monotonic + + ; Max operations (signed and unsigned) + %8 = atomicrmw max ptr addrspace(7) %dsmem_ptr, i32 1 monotonic + %9 = atomicrmw max ptr addrspace(7) %dsmem_ptr, i64 1 monotonic + %10 = atomicrmw umax ptr addrspace(7) %dsmem_ptr, i32 1 monotonic + %11 = atomicrmw umax ptr addrspace(7) %dsmem_ptr, i64 1 monotonic + + ; Inc/Dec operations (32-bit only) + %12 = atomicrmw uinc_wrap ptr addrspace(7) %dsmem_ptr, i32 1 monotonic + %13 = atomicrmw udec_wrap ptr addrspace(7) %dsmem_ptr, i32 1 monotonic + + ret void +} + +; Bitwise atomic operations tests +define void @test_distributed_shared_cluster_bitwise_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr { +; CHECK-LABEL: test_distributed_shared_cluster_bitwise_atomic( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_bitwise_atomic_param_0]; +; CHECK-NEXT: atom.shared::cluster.and.b32 %r1, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.and.b64 %rd2, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.or.b32 %r2, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.or.b64 %rd3, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.xor.b32 %r3, [%rd1], 1; +; CHECK-NEXT: atom.shared::cluster.xor.b64 %rd4, [%rd1], 1; +; CHECK-NEXT: ret; +entry: + ; Bitwise operations + %0 = atomicrmw and ptr addrspace(7) %dsmem_ptr, i32 1 monotonic + %1 = atomicrmw and ptr addrspace(7) %dsmem_ptr, i64 1 monotonic + %2 = atomicrmw or ptr addrspace(7) %dsmem_ptr, i32 1 monotonic + %3 = atomicrmw or ptr addrspace(7) %dsmem_ptr, i64 1 monotonic + %4 = atomicrmw xor ptr addrspace(7) %dsmem_ptr, i32 1 monotonic + %5 = atomicrmw xor ptr addrspace(7) %dsmem_ptr, i64 1 
monotonic + + ret void +} + +; Compare-exchange operations tests +define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr { +; CHECK-LABEL: test_distributed_shared_cluster_cmpxchg( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<11>; +; CHECK-NEXT: .reg .b32 %r<53>; +; CHECK-NEXT: .reg .b64 %rd<12>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0]; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r24, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r25, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r26, [%rd2], 1, 0; +; CHECK-NEXT: atom.release.shared::cluster.cas.b32 %r27, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r28, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r29, [%rd2], 1, 0; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r30, [%rd2], 1, 0; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r31, [%rd2], 1, 0; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r32, [%rd2], 1, 0; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0; +; CHECK-NEXT: atom.release.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0; +; CHECK-NEXT: and.b64 %rd1, %rd2, -4; +; 
CHECK-NEXT: cvt.u32.u64 %r33, %rd2; +; CHECK-NEXT: and.b32 %r34, %r33, 3; +; CHECK-NEXT: shl.b32 %r1, %r34, 3; +; CHECK-NEXT: mov.b32 %r35, 65535; +; CHECK-NEXT: shl.b32 %r36, %r35, %r1; +; CHECK-NEXT: not.b32 %r2, %r36; +; CHECK-NEXT: mov.b32 %r37, 1; +; CHECK-NEXT: shl.b32 %r3, %r37, %r1; +; CHECK-NEXT: ld.shared::cluster.u32 %r38, [%rd1]; +; CHECK-NEXT: and.b32 %r48, %r38, %r2; +; CHECK-NEXT: $L__BB3_1: // %partword.cmpxchg.loop33 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: or.b32 %r39, %r48, %r3; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48; +; CHECK-NEXT: setp.eq.s32 %p1, %r6, %r39; +; CHECK-NEXT: @%p1 bra $L__BB3_3; +; CHECK-NEXT: // %bb.2: // %partword.cmpxchg.failure32 +; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 +; CHECK-NEXT: and.b32 %r7, %r6, %r2; +; CHECK-NEXT: setp.ne.s32 %p2, %r48, %r7; +; CHECK-NEXT: mov.b32 %r48, %r7; +; CHECK-NEXT: @%p2 bra $L__BB3_1; +; CHECK-NEXT: $L__BB3_3: // %partword.cmpxchg.end31 +; CHECK-NEXT: ld.shared::cluster.u32 %r40, [%rd1]; +; CHECK-NEXT: and.b32 %r49, %r40, %r2; +; CHECK-NEXT: $L__BB3_4: // %partword.cmpxchg.loop23 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: or.b32 %r41, %r49, %r3; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49; +; CHECK-NEXT: setp.eq.s32 %p3, %r10, %r41; +; CHECK-NEXT: @%p3 bra $L__BB3_6; +; CHECK-NEXT: // %bb.5: // %partword.cmpxchg.failure22 +; CHECK-NEXT: // in Loop: Header=BB3_4 Depth=1 +; CHECK-NEXT: and.b32 %r11, %r10, %r2; +; CHECK-NEXT: setp.ne.s32 %p4, %r49, %r11; +; CHECK-NEXT: mov.b32 %r49, %r11; +; CHECK-NEXT: @%p4 bra $L__BB3_4; +; CHECK-NEXT: $L__BB3_6: // %partword.cmpxchg.end21 +; CHECK-NEXT: fence.acq_rel.sys; +; CHECK-NEXT: fence.acq_rel.sys; +; CHECK-NEXT: ld.shared::cluster.u32 %r42, [%rd1]; +; CHECK-NEXT: and.b32 %r50, %r42, %r2; +; CHECK-NEXT: $L__BB3_7: // %partword.cmpxchg.loop13 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: or.b32 %r43, %r50, 
%r3; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50; +; CHECK-NEXT: setp.eq.s32 %p5, %r14, %r43; +; CHECK-NEXT: @%p5 bra $L__BB3_9; +; CHECK-NEXT: // %bb.8: // %partword.cmpxchg.failure12 +; CHECK-NEXT: // in Loop: Header=BB3_7 Depth=1 +; CHECK-NEXT: and.b32 %r15, %r14, %r2; +; CHECK-NEXT: setp.ne.s32 %p6, %r50, %r15; +; CHECK-NEXT: mov.b32 %r50, %r15; +; CHECK-NEXT: @%p6 bra $L__BB3_7; +; CHECK-NEXT: $L__BB3_9: // %partword.cmpxchg.end11 +; CHECK-NEXT: fence.acq_rel.sys; +; CHECK-NEXT: ld.shared::cluster.u32 %r44, [%rd1]; +; CHECK-NEXT: and.b32 %r51, %r44, %r2; +; CHECK-NEXT: $L__BB3_10: // %partword.cmpxchg.loop3 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: or.b32 %r45, %r51, %r3; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51; +; CHECK-NEXT: setp.eq.s32 %p7, %r18, %r45; +; CHECK-NEXT: @%p7 bra $L__BB3_12; +; CHECK-NEXT: // %bb.11: // %partword.cmpxchg.failure2 +; CHECK-NEXT: // in Loop: Header=BB3_10 Depth=1 +; CHECK-NEXT: and.b32 %r19, %r18, %r2; +; CHECK-NEXT: setp.ne.s32 %p8, %r51, %r19; +; CHECK-NEXT: mov.b32 %r51, %r19; +; CHECK-NEXT: @%p8 bra $L__BB3_10; +; CHECK-NEXT: $L__BB3_12: // %partword.cmpxchg.end1 +; CHECK-NEXT: fence.acq_rel.sys; +; CHECK-NEXT: fence.sc.sys; +; CHECK-NEXT: ld.shared::cluster.u32 %r46, [%rd1]; +; CHECK-NEXT: and.b32 %r52, %r46, %r2; +; CHECK-NEXT: $L__BB3_13: // %partword.cmpxchg.loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: or.b32 %r47, %r52, %r3; +; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52; +; CHECK-NEXT: setp.eq.s32 %p9, %r22, %r47; +; CHECK-NEXT: @%p9 bra $L__BB3_15; +; CHECK-NEXT: // %bb.14: // %partword.cmpxchg.failure +; CHECK-NEXT: // in Loop: Header=BB3_13 Depth=1 +; CHECK-NEXT: and.b32 %r23, %r22, %r2; +; CHECK-NEXT: setp.ne.s32 %p10, %r52, %r23; +; CHECK-NEXT: mov.b32 %r52, %r23; +; CHECK-NEXT: @%p10 bra $L__BB3_13; +; CHECK-NEXT: $L__BB3_15: // %partword.cmpxchg.end +; CHECK-NEXT: 
fence.acq_rel.sys; +; CHECK-NEXT: ret; +entry: + ; Compare-exchange operation - all memory ordering combinations for 32-bit + %0 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 monotonic monotonic + %1 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acquire monotonic + %2 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acquire acquire + %3 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 release monotonic + %4 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acq_rel monotonic + %5 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acq_rel acquire + %6 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 seq_cst monotonic + %7 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 seq_cst acquire + %8 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 seq_cst seq_cst + + ; Compare-exchange operation - all memory ordering combinations for 64-bit + %9 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 monotonic monotonic + %10 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acquire monotonic + %11 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acquire acquire + %12 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 release monotonic + %13 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acq_rel monotonic + %14 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acq_rel acquire + %15 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 seq_cst monotonic + %16 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 seq_cst acquire + %17 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 seq_cst seq_cst + + ; Compare-exchange operation - 16-bit + %18 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 monotonic monotonic + %19 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 acquire acquire + %20 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 release monotonic + %21 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 acq_rel acquire + %22 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 seq_cst seq_cst + + ret void +} diff --git 
a/llvm/test/CodeGen/NVPTX/nvptx-aa.ll b/llvm/test/CodeGen/NVPTX/nvptx-aa.ll
index 074e741dc3e94..924220326c341 100644
--- a/llvm/test/CodeGen/NVPTX/nvptx-aa.ll
+++ b/llvm/test/CodeGen/NVPTX/nvptx-aa.ll
@@ -18,25 +18,34 @@ target triple = "nvptx64-nvidia-cuda"
 ; CHECK-ALIAS: NoAlias: i8 addrspace(1)* %global, i8 addrspace(5)* %local
 ; CHECK-ALIAS: NoAlias: i8 addrspace(5)* %local, i8 addrspace(3)* %shared
 ; CHECK-ALIAS: NoAlias: i8 addrspace(4)* %const, i8 addrspace(5)* %local
+; CHECK-ALIAS: MayAlias: i8* %gen, i8 addrspace(7)* %shared_cluster
+; CHECK-ALIAS: NoAlias: i8 addrspace(1)* %global, i8 addrspace(7)* %shared_cluster
+; CHECK-ALIAS: MayAlias: i8 addrspace(3)* %shared, i8 addrspace(7)* %shared_cluster
+; CHECK-ALIAS: NoAlias: i8 addrspace(4)* %const, i8 addrspace(7)* %shared_cluster
+; CHECK-ALIAS: NoAlias: i8 addrspace(5)* %local, i8 addrspace(7)* %shared_cluster
 ; CHECK-ALIAS: MayAlias: i8* %gen, i8 addrspace(101)* %param
 ; CHECK-ALIAS: NoAlias: i8 addrspace(1)* %global, i8 addrspace(101)* %param
 ; CHECK-ALIAS: NoAlias: i8 addrspace(101)* %param, i8 addrspace(3)* %shared
 ; CHECK-ALIAS: NoAlias: i8 addrspace(4)* %const, i8 addrspace(101)* %param
 ; CHECK-ALIAS: NoAlias: i8 addrspace(5)* %local, i8 addrspace(101)* %param
+; CHECK-ALIAS: NoAlias: i8 addrspace(101)* %param, i8 addrspace(7)* %shared_cluster
 
-define i8 @test_alias(ptr %gen, ptr addrspace(1) %global, ptr addrspace(3) %shared, ptr addrspace(4) %const, ptr addrspace(5) %local) {
+define i8 @test_alias(ptr %gen, ptr addrspace(1) %global, ptr addrspace(3) %shared, ptr addrspace(4) %const, ptr addrspace(5) %local, ptr addrspace(7) %shared_cluster) {
   %param = addrspacecast ptr %gen to ptr addrspace(101)
   %v1 = load i8, ptr %gen
   %v2 = load i8, ptr addrspace(1) %global
   %v3 = load i8, ptr addrspace(3) %shared
   %v4 = load i8, ptr addrspace(4) %const
   %v5 = load i8, ptr addrspace(5) %local
-  %v6 = load i8, ptr addrspace(101) %param
+  %v6 = load i8, ptr addrspace(7) %shared_cluster
+  %v7 = load i8, ptr addrspace(101) %param
+  ; Every loaded value is summed into the returned result, so no load is left unused.
   %res1 = add i8 %v1, %v2
   %res2 = add i8 %res1, %v3
   %res3 = add i8 %res2, %v4
   %res4 = add i8 %res3, %v5
   %res5 = add i8 %res4, %v6
-  ret i8 %res5
+  %res6 = add i8 %res5, %v7
+  ret i8 %res6
 }
 
diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/isspacep.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/isspacep.ll
index 348fa688770df..d05e106d81342 100644
--- a/llvm/test/Transforms/InferAddressSpaces/NVPTX/isspacep.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/isspacep.ll
@@ -114,24 +114,21 @@ entry:
   ret i1 %val
 }
 
-define i1 @test_isspacep_cluster_shared_unsure(ptr addrspace(3) %addr) {
-; CHECK-LABEL: define i1 @test_isspacep_cluster_shared_unsure(
-; CHECK-SAME: ptr addrspace(3) [[ADDR:%.*]]) {
+define i1 @test_isspacep_shared_cluster_true(ptr addrspace(7) %addr) {
+; CHECK-LABEL: define i1 @test_isspacep_shared_cluster_true(
+; CHECK-SAME: ptr addrspace(7) [[ADDR:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[ADDR1:%.*]] = getelementptr i8, ptr addrspace(3) [[ADDR]], i32 10
-; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(3) [[ADDR1]] to ptr
-; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.nvvm.isspacep.shared.cluster(ptr [[TMP0]])
-; CHECK-NEXT:    ret i1 [[VAL]]
+; CHECK-NEXT:    ret i1 true
 ;
 entry:
-  %addr0 = addrspacecast ptr addrspace(3) %addr to ptr
+  %addr0 = addrspacecast ptr addrspace(7) %addr to ptr
   %addr1 = getelementptr i8, ptr %addr0, i32 10
   %val = call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %addr1)
   ret i1 %val
 }
 
-define i1 @test_isspacep_cluster_shared_false(ptr addrspace(1) %addr) {
-; CHECK-LABEL: define i1 @test_isspacep_cluster_shared_false(
+define i1 @test_isspacep_shared_cluster_false(ptr addrspace(1) %addr) {
+; CHECK-LABEL: define i1 @test_isspacep_shared_cluster_false(
 ; CHECK-SAME: ptr addrspace(1) [[ADDR:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    ret i1 false
@@ -142,3 +139,34 @@ entry:
   %val = call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %addr1)
   ret i1 %val
 }
+
+; isspacep.shared.cluster folds to true for a shared (addrspace(3)) pointer: CTA shared memory lies inside the shared::cluster window.
+define i1 @test_isspacep_cluster_shared_shared(ptr addrspace(3) %addr) {
+; CHECK-LABEL: define i1 @test_isspacep_cluster_shared_shared(
+; CHECK-SAME: ptr addrspace(3) [[ADDR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret i1 true
+;
+entry:
+  %addr0 = addrspacecast ptr addrspace(3) %addr to ptr
+  %addr1 = getelementptr i8, ptr %addr0, i32 10
+  %val = call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %addr1)
+  ret i1 %val
+}
+
+; isspacep.shared on a shared::cluster (addrspace(7)) pointer is not folded: the pointer may refer to another CTA's shared memory, so the call must remain.
+define i1 @test_isspacep_shared_shared_cluster(ptr addrspace(7) %addr) {
+; CHECK-LABEL: define i1 @test_isspacep_shared_shared_cluster(
+; CHECK-SAME: ptr addrspace(7) [[ADDR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ADDR2:%.*]] = getelementptr i8, ptr addrspace(7) [[ADDR]], i32 10
+; CHECK-NEXT:    [[ADDR1:%.*]] = addrspacecast ptr addrspace(7) [[ADDR2]] to ptr
+; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.nvvm.isspacep.shared(ptr [[ADDR1]])
+; CHECK-NEXT:    ret i1 [[VAL]]
+;
+entry:
+  %addr0 = addrspacecast ptr addrspace(7) %addr to ptr
+  %addr1 = getelementptr i8, ptr %addr0, i32 10
+  %val = call i1 @llvm.nvvm.isspacep.shared(ptr %addr1)
+  ret i1 %val
+}