Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions clang/lib/Basic/Targets/NVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,11 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,

if (TargetPointerWidth == 32)
resetDataLayout(
"e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
"e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
else if (Opts.NVPTXUseShortPointers)
resetDataLayout("e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:"
"16-v32:32-n16:32:64");
resetDataLayout(
"e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:"
"16-v32:32-n16:32:64");
else
resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");

Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGen/target-data.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@

// RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=NVPTX
// NVPTX: target datalayout = "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
// NVPTX: target datalayout = "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"

// RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=NVPTX64
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenCUDA/builtins-sm90.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
auto * sptr = (__attribute__((address_space(3))) void *)ptr;
// CHECK: call ptr @llvm.nvvm.mapa(ptr %{{.*}}, i32 %{{.*}})
out[i++] = (long) __nvvm_mapa(ptr, u);
// CHECK: call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
// CHECK: call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
out[i++] = (long) __nvvm_mapa_shared_cluster(sptr, u);
// CHECK: call i32 @llvm.nvvm.getctarank(ptr {{.*}})
out[i++] = __nvvm_getctarank(ptr);
Expand Down
33 changes: 30 additions & 3 deletions llvm/docs/NVPTXUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ The NVPTX back-end uses the following address space mapping:
3 Shared
4 Constant
5 Local
7 Shared Cluster
============= ======================

Every global variable and pointer type is assigned to one of these address
Expand Down Expand Up @@ -306,6 +307,32 @@ If the given pointer in the generic address space refers to memory which falls
within the state space of the intrinsic (and therefore could be safely address
space casted to this space), 1 is returned, otherwise 0 is returned.

'``llvm.nvvm.mapa.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

declare ptr @llvm.nvvm.mapa(ptr %p, i32 %rank)
declare ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %rank)

Overview:
"""""""""

The '``llvm.nvvm.mapa.*``' intrinsics map a shared memory pointer ``%p`` of another CTA with rank ``%rank`` to the current CTA.
The ``llvm.nvvm.mapa`` form expects a generic pointer to shared memory and returns a generic pointer to shared cluster memory.
The ``llvm.nvvm.mapa.shared.cluster`` form expects a pointer to shared memory and returns a pointer to shared cluster memory.
They correspond directly to the ``mapa`` and ``mapa.shared.cluster`` PTX instructions.

Semantics:
""""""""""

Given a pointer ``%p`` to shared memory within the current CTA, these
intrinsics return the address of the corresponding shared memory location in
the CTA whose rank in the cluster is ``%rank``.

Arithmetic Intrinsics
---------------------

Expand Down Expand Up @@ -552,7 +579,7 @@ Syntax:

.. code-block:: llvm

declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)

Overview:
"""""""""
Expand Down Expand Up @@ -616,7 +643,7 @@ Syntax:

.. code-block:: llvm

declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)

Overview:
"""""""""
Expand Down Expand Up @@ -771,7 +798,7 @@ Syntax:

.. code-block:: llvm

declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
Expand Down
53 changes: 27 additions & 26 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,11 @@
// * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap
// * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap

def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr

//
// MISC
Expand Down Expand Up @@ -691,15 +692,15 @@ class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
list<LLVMType> ArgsTy = !listconcat(
[llvm_shared_ptr_ty, // dst_smem_ptr
llvm_shared_ptr_ty, // mbarrier_smem_ptr
llvm_ptr_ty], // tensormap_ptr
TensorDimsTy, // actual tensor dims
Im2ColOffsetsTy, // im2col offsets
[llvm_i16_ty, // cta_mask
llvm_i64_ty, // cache_hint
llvm_i1_ty, // Flag for cta_mask
llvm_i1_ty] // Flag for cache_hint
[llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
llvm_shared_ptr_ty, // mbarrier_smem_ptr
llvm_ptr_ty], // tensormap_ptr
TensorDimsTy, // actual tensor dims
Im2ColOffsetsTy, // im2col offsets
[llvm_i16_ty, // cta_mask
llvm_i64_ty, // cache_hint
llvm_i1_ty, // Flag for cta_mask
llvm_i1_ty] // Flag for cache_hint
);

int TempFlagsStartIdx = !add(dim, 5);
Expand Down Expand Up @@ -5134,7 +5135,7 @@ def int_nvvm_mapa
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
"llvm.nvvm.mapa">;
def int_nvvm_mapa_shared_cluster
: DefaultAttrsIntrinsic<[llvm_shared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
: DefaultAttrsIntrinsic<[llvm_shared_cluster_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
"llvm.nvvm.mapa.shared.cluster">;
def int_nvvm_getctarank
Expand Down Expand Up @@ -5234,14 +5235,14 @@ def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
// From Global to Shared Cluster
def int_nvvm_cp_async_bulk_global_to_shared_cluster
: DefaultAttrsIntrinsic<[],
[llvm_shared_ptr_ty, // dst_smem_ptr
llvm_shared_ptr_ty, // mbarrier_ptr
llvm_global_ptr_ty, // src_gmem_ptr
llvm_i32_ty, // copy_size
llvm_i16_ty, // cta_mask
llvm_i64_ty, // cache_hint
llvm_i1_ty, // Flag for cta_mask
llvm_i1_ty], // Flag for cache_hint
[llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
llvm_shared_ptr_ty, // mbarrier_ptr
llvm_global_ptr_ty, // src_gmem_ptr
llvm_i32_ty, // copy_size
llvm_i16_ty, // cta_mask
llvm_i64_ty, // cache_hint
llvm_i1_ty, // Flag for cta_mask
llvm_i1_ty], // Flag for cache_hint
[IntrConvergent, IntrArgMemOnly,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
Expand All @@ -5251,10 +5252,10 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster
// From Shared CTA to Shared Cluster
def int_nvvm_cp_async_bulk_shared_cta_to_cluster
: DefaultAttrsIntrinsic<[],
[llvm_shared_ptr_ty, // dst_smem_ptr
llvm_shared_ptr_ty, // mbarrier_ptr
llvm_shared_ptr_ty, // src_smem_ptr
llvm_i32_ty], // copy_size
[llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
llvm_shared_ptr_ty, // mbarrier_ptr
llvm_shared_ptr_ty, // src_smem_ptr
llvm_i32_ty], // copy_size
[IntrConvergent, IntrArgMemOnly,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
Expand Down
1 change: 1 addition & 0 deletions llvm/include/llvm/Support/NVPTXAddrSpace.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ enum AddressSpace : unsigned {
ADDRESS_SPACE_CONST = 4,
ADDRESS_SPACE_LOCAL = 5,
ADDRESS_SPACE_TENSOR = 6,
ADDRESS_SPACE_SHARED_CLUSTER = 7,

ADDRESS_SPACE_PARAM = 101,
};
Expand Down
83 changes: 83 additions & 0 deletions llvm/lib/IR/AutoUpgrade.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
#include <cstdint>
Expand Down Expand Up @@ -938,6 +939,47 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
return false; // No other 'arm.*', 'aarch64.*'.
}

// Detect NVPTX shared::cluster intrinsics that were declared with the legacy
// addrspace(3) (shared) pointer signature and must be auto-upgraded to the
// new addrspace(7) (shared::cluster) form.
//
// \param F    The candidate intrinsic declaration being upgraded.
// \param Name The intrinsic name with the "llvm.nvvm." prefix already
//             stripped by the caller.
// \returns The new intrinsic ID to upgrade to, or Intrinsic::not_intrinsic
//          if no upgrade is required.
static Intrinsic::ID shouldUpgradeNVPTXSharedClusterIntrinsic(Function *F,
                                                              StringRef Name) {
  // mapa.shared.cluster previously returned an addrspace(3) pointer; the new
  // definition returns addrspace(7). Only upgrade declarations that still use
  // the old return address space.
  if (Name.consume_front("mapa.shared.cluster"))
    if (F->getReturnType()->getPointerAddressSpace() ==
        NVPTXAS::ADDRESS_SPACE_SHARED)
      return Intrinsic::nvvm_mapa_shared_cluster;

  // The cp.async.bulk family previously took its destination pointer
  // (argument 0) in addrspace(3); the new definitions take addrspace(7).
  if (Name.consume_front("cp.async.bulk.")) {
    Intrinsic::ID ID =
        StringSwitch<Intrinsic::ID>(Name)
            .Case("global.to.shared.cluster",
                  Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster)
            .Case("shared.cta.to.cluster",
                  Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster)
            .Case("tensor.g2s.im2col.3d",
                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d)
            .Case("tensor.g2s.im2col.4d",
                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d)
            .Case("tensor.g2s.im2col.5d",
                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d)
            .Case("tensor.g2s.tile.1d",
                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d)
            .Case("tensor.g2s.tile.2d",
                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d)
            .Case("tensor.g2s.tile.3d",
                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d)
            .Case("tensor.g2s.tile.4d",
                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d)
            .Case("tensor.g2s.tile.5d",
                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d)
            .Default(Intrinsic::not_intrinsic);

    // Upgrade only if the declaration still carries the old destination
    // address space; already-upgraded declarations are left alone.
    if (ID != Intrinsic::not_intrinsic)
      if (F->getArg(0)->getType()->getPointerAddressSpace() ==
          NVPTXAS::ADDRESS_SPACE_SHARED)
        return ID;
  }

  return Intrinsic::not_intrinsic;
}

static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) {
if (Name.consume_front("fma.rn."))
return StringSwitch<Intrinsic::ID>(Name)
Expand Down Expand Up @@ -1278,6 +1320,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
}
}

// Upgrade Distributed Shared Memory Intrinsics
Intrinsic::ID IID = shouldUpgradeNVPTXSharedClusterIntrinsic(F, Name);
if (IID != Intrinsic::not_intrinsic) {
rename(F);
NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
return true;
}

// The following nvvm intrinsics correspond exactly to an LLVM idiom, but
// not to an intrinsic alone. We expand them in UpgradeIntrinsicCall.
//
Expand Down Expand Up @@ -4718,6 +4768,39 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
CI->eraseFromParent();
return;
}
case Intrinsic::nvvm_mapa_shared_cluster: {
// Create a new call with the correct address space.
NewCall =
Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)});
Value *Res = NewCall;
Res = Builder.CreateAddrSpaceCast(
Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED));
NewCall->takeName(CI);
CI->replaceAllUsesWith(Res);
CI->eraseFromParent();
return;
}
case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: {
// Create a new call with the correct address space.
SmallVector<Value *, 4> Args(CI->args());
Args[0] = Builder.CreateAddrSpaceCast(
Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER));

NewCall = Builder.CreateCall(NewFn, Args);
NewCall->takeName(CI);
CI->replaceAllUsesWith(NewCall);
CI->eraseFromParent();
return;
}
case Intrinsic::riscv_sha256sig0:
case Intrinsic::riscv_sha256sig1:
case Intrinsic::riscv_sha256sum0:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
case NVPTX::AddressSpace::Global:
case NVPTX::AddressSpace::Const:
case NVPTX::AddressSpace::Shared:
case NVPTX::AddressSpace::SharedCluster:
case NVPTX::AddressSpace::Param:
case NVPTX::AddressSpace::Local:
O << "." << A;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/NVPTX.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ enum AddressSpace : AddressSpaceUnderlyingType {
Shared = 3,
Const = 4,
Local = 5,
SharedCluster = 7,

// NVPTX Backend Private:
Param = 101
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ static AliasResult::Kind getAliasResult(unsigned AS1, unsigned AS2) {
// TODO: cvta.param is not yet supported. We need to change aliasing
// rules once it is added.

// Distributed shared memory aliases with shared memory.
if (((AS1 == ADDRESS_SPACE_SHARED) &&
(AS2 == ADDRESS_SPACE_SHARED_CLUSTER)) ||
((AS1 == ADDRESS_SPACE_SHARED_CLUSTER) && (AS2 == ADDRESS_SPACE_SHARED)))
return AliasResult::MayAlias;

return (AS1 == AS2 ? AliasResult::MayAlias : AliasResult::NoAlias);
}

Expand Down
17 changes: 16 additions & 1 deletion llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,8 @@ static std::optional<unsigned> convertAS(unsigned AS) {
return NVPTX::AddressSpace::Global;
case llvm::ADDRESS_SPACE_SHARED:
return NVPTX::AddressSpace::Shared;
case llvm::ADDRESS_SPACE_SHARED_CLUSTER:
return NVPTX::AddressSpace::SharedCluster;
case llvm::ADDRESS_SPACE_GENERIC:
return NVPTX::AddressSpace::Generic;
case llvm::ADDRESS_SPACE_PARAM:
Expand Down Expand Up @@ -658,7 +660,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
bool AddrGenericOrGlobalOrShared =
(CodeAddrSpace == NVPTX::AddressSpace::Generic ||
CodeAddrSpace == NVPTX::AddressSpace::Global ||
CodeAddrSpace == NVPTX::AddressSpace::Shared);
CodeAddrSpace == NVPTX::AddressSpace::Shared ||
CodeAddrSpace == NVPTX::AddressSpace::SharedCluster);
if (!AddrGenericOrGlobalOrShared)
return NVPTX::Ordering::NotAtomic;

Expand Down Expand Up @@ -979,6 +982,12 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
case ADDRESS_SPACE_SHARED:
Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
break;
case ADDRESS_SPACE_SHARED_CLUSTER:
if (!TM.is64Bit())
report_fatal_error(
"Shared cluster address space is only supported in 64-bit mode");
Opc = NVPTX::cvta_shared_cluster_64;
break;
case ADDRESS_SPACE_CONST:
Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
break;
Expand All @@ -1004,6 +1013,12 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
case ADDRESS_SPACE_SHARED:
Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
break;
case ADDRESS_SPACE_SHARED_CLUSTER:
if (!TM.is64Bit())
report_fatal_error(
"Shared cluster address space is only supported in 64-bit mode");
Opc = NVPTX::cvta_to_shared_cluster_64;
break;
case ADDRESS_SPACE_CONST:
Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
break;
Expand Down
Loading
Loading