Skip to content

Commit 4844c92

Browse files
committed
initial implementation
1 parent 2b002d6 commit 4844c92

File tree

23 files changed

+558
-112
lines changed

23 files changed

+558
-112
lines changed

clang/lib/Basic/Targets/NVPTX.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,11 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
7171

7272
if (TargetPointerWidth == 32)
7373
resetDataLayout(
74-
"e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
74+
"e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
7575
else if (Opts.NVPTXUseShortPointers)
76-
resetDataLayout("e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:"
77-
"16-v32:32-n16:32:64");
76+
resetDataLayout(
77+
"e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:"
78+
"16-v32:32-n16:32:64");
7879
else
7980
resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
8081

clang/test/CodeGen/target-data.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@
160160

161161
// RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
162162
// RUN: FileCheck %s -check-prefix=NVPTX
163-
// NVPTX: target datalayout = "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
163+
// NVPTX: target datalayout = "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
164164

165165
// RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \
166166
// RUN: FileCheck %s -check-prefix=NVPTX64

clang/test/CodeGenCUDA/builtins-sm90.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
5050
auto * sptr = (__attribute__((address_space(3))) void *)ptr;
5151
// CHECK: call ptr @llvm.nvvm.mapa(ptr %{{.*}}, i32 %{{.*}})
5252
out[i++] = (long) __nvvm_mapa(ptr, u);
53-
// CHECK: call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
53+
// CHECK: call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
5454
out[i++] = (long) __nvvm_mapa_shared_cluster(sptr, u);
5555
// CHECK: call i32 @llvm.nvvm.getctarank(ptr {{.*}})
5656
out[i++] = __nvvm_getctarank(ptr);

llvm/docs/NVPTXUsage.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -552,7 +552,7 @@ Syntax:
552552

553553
.. code-block:: llvm
554554
555-
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
555+
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
556556
557557
Overview:
558558
"""""""""
@@ -616,7 +616,7 @@ Syntax:
616616

617617
.. code-block:: llvm
618618
619-
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
619+
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
620620
621621
Overview:
622622
"""""""""
@@ -771,7 +771,7 @@ Syntax:
771771

772772
.. code-block:: llvm
773773
774-
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
774+
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
775775
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
776776
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
777777
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
131131
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
132132
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
133133
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
134+
def llvm_dshared_ptr_ty : LLVMQualPointerType<7>; // (dshared)ptr
134135

135136
//
136137
// MISC
@@ -691,15 +692,15 @@ class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
691692
list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
692693
list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
693694
list<LLVMType> ArgsTy = !listconcat(
694-
[llvm_shared_ptr_ty, // dst_smem_ptr
695-
llvm_shared_ptr_ty, // mbarrier_smem_ptr
696-
llvm_ptr_ty], // tensormap_ptr
697-
TensorDimsTy, // actual tensor dims
698-
Im2ColOffsetsTy, // im2col offsets
699-
[llvm_i16_ty, // cta_mask
700-
llvm_i64_ty, // cache_hint
701-
llvm_i1_ty, // Flag for cta_mask
702-
llvm_i1_ty] // Flag for cache_hint
695+
[llvm_dshared_ptr_ty, // dst_smem_ptr
696+
llvm_shared_ptr_ty, // mbarrier_smem_ptr
697+
llvm_ptr_ty], // tensormap_ptr
698+
TensorDimsTy, // actual tensor dims
699+
Im2ColOffsetsTy, // im2col offsets
700+
[llvm_i16_ty, // cta_mask
701+
llvm_i64_ty, // cache_hint
702+
llvm_i1_ty, // Flag for cta_mask
703+
llvm_i1_ty] // Flag for cache_hint
703704
);
704705

705706
int TempFlagsStartIdx = !add(dim, 5);
@@ -5118,7 +5119,7 @@ def int_nvvm_mapa
51185119
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
51195120
"llvm.nvvm.mapa">;
51205121
def int_nvvm_mapa_shared_cluster
5121-
: DefaultAttrsIntrinsic<[llvm_shared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
5122+
: DefaultAttrsIntrinsic<[llvm_dshared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
51225123
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
51235124
"llvm.nvvm.mapa.shared.cluster">;
51245125
def int_nvvm_getctarank
@@ -5218,14 +5219,14 @@ def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
52185219
// From Global to Shared Cluster
52195220
def int_nvvm_cp_async_bulk_global_to_shared_cluster
52205221
: DefaultAttrsIntrinsic<[],
5221-
[llvm_shared_ptr_ty, // dst_smem_ptr
5222-
llvm_shared_ptr_ty, // mbarrier_ptr
5223-
llvm_global_ptr_ty, // src_gmem_ptr
5224-
llvm_i32_ty, // copy_size
5225-
llvm_i16_ty, // cta_mask
5226-
llvm_i64_ty, // cache_hint
5227-
llvm_i1_ty, // Flag for cta_mask
5228-
llvm_i1_ty], // Flag for cache_hint
5222+
[llvm_dshared_ptr_ty, // dst_dsmem_ptr
5223+
llvm_shared_ptr_ty, // mbarrier_ptr
5224+
llvm_global_ptr_ty, // src_gmem_ptr
5225+
llvm_i32_ty, // copy_size
5226+
llvm_i16_ty, // cta_mask
5227+
llvm_i64_ty, // cache_hint
5228+
llvm_i1_ty, // Flag for cta_mask
5229+
llvm_i1_ty], // Flag for cache_hint
52295230
[IntrConvergent, IntrArgMemOnly,
52305231
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
52315232
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
@@ -5235,10 +5236,10 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster
52355236
// From Shared CTA to Shared Cluster
52365237
def int_nvvm_cp_async_bulk_shared_cta_to_cluster
52375238
: DefaultAttrsIntrinsic<[],
5238-
[llvm_shared_ptr_ty, // dst_smem_ptr
5239-
llvm_shared_ptr_ty, // mbarrier_ptr
5240-
llvm_shared_ptr_ty, // src_smem_ptr
5241-
llvm_i32_ty], // copy_size
5239+
[llvm_dshared_ptr_ty, // dst_dsmem_ptr
5240+
llvm_shared_ptr_ty, // mbarrier_ptr
5241+
llvm_shared_ptr_ty, // src_smem_ptr
5242+
llvm_i32_ty], // copy_size
52425243
[IntrConvergent, IntrArgMemOnly,
52435244
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
52445245
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,

llvm/include/llvm/Support/NVPTXAddrSpace.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ enum AddressSpace : unsigned {
2525
ADDRESS_SPACE_CONST = 4,
2626
ADDRESS_SPACE_LOCAL = 5,
2727
ADDRESS_SPACE_TENSOR = 6,
28+
ADDRESS_SPACE_DSHARED = 7,
2829

2930
ADDRESS_SPACE_PARAM = 101,
3031
};

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include "llvm/Support/AMDGPUAddrSpace.h"
4747
#include "llvm/Support/CommandLine.h"
4848
#include "llvm/Support/ErrorHandling.h"
49+
#include "llvm/Support/NVPTXAddrSpace.h"
4950
#include "llvm/Support/Regex.h"
5051
#include "llvm/TargetParser/Triple.h"
5152
#include <cstdint>
@@ -938,6 +939,47 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
938939
return false; // No other 'arm.*', 'aarch64.*'.
939940
}
940941

942+
static Intrinsic::ID shouldUpgradeNVPTXDSharedIntrinsic(Function *F,
943+
StringRef Name) {
944+
if (Name.consume_front("mapa.shared.cluster"))
945+
if (F->getReturnType()->getPointerAddressSpace() ==
946+
NVPTXAS::ADDRESS_SPACE_SHARED)
947+
return Intrinsic::nvvm_mapa_shared_cluster;
948+
949+
if (Name.consume_front("cp.async.bulk.")) {
950+
Intrinsic::ID ID =
951+
StringSwitch<Intrinsic::ID>(Name)
952+
.Case("global.to.shared.cluster",
953+
Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster)
954+
.Case("shared.cta.to.cluster",
955+
Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster)
956+
.Case("tensor.g2s.im2col.3d",
957+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d)
958+
.Case("tensor.g2s.im2col.4d",
959+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d)
960+
.Case("tensor.g2s.im2col.5d",
961+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d)
962+
.Case("tensor.g2s.tile.1d",
963+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d)
964+
.Case("tensor.g2s.tile.2d",
965+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d)
966+
.Case("tensor.g2s.tile.3d",
967+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d)
968+
.Case("tensor.g2s.tile.4d",
969+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d)
970+
.Case("tensor.g2s.tile.5d",
971+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d)
972+
.Default(Intrinsic::not_intrinsic);
973+
974+
if (ID != Intrinsic::not_intrinsic)
975+
if (F->getArg(0)->getType()->getPointerAddressSpace() ==
976+
NVPTXAS::ADDRESS_SPACE_SHARED)
977+
return ID;
978+
}
979+
980+
return Intrinsic::not_intrinsic;
981+
}
982+
941983
static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) {
942984
if (Name.consume_front("fma.rn."))
943985
return StringSwitch<Intrinsic::ID>(Name)
@@ -1278,6 +1320,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
12781320
}
12791321
}
12801322

1323+
// Upgrade Distributed Shared Memory Intrinsics
1324+
Intrinsic::ID IID = shouldUpgradeNVPTXDSharedIntrinsic(F, Name);
1325+
if (IID != Intrinsic::not_intrinsic) {
1326+
rename(F);
1327+
NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
1328+
return true;
1329+
}
1330+
12811331
// The following nvvm intrinsics correspond exactly to an LLVM idiom, but
12821332
// not to an intrinsic alone. We expand them in UpgradeIntrinsicCall.
12831333
//
@@ -4718,6 +4768,43 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
47184768
CI->eraseFromParent();
47194769
return;
47204770
}
4771+
case Intrinsic::nvvm_mapa_shared_cluster: {
4772+
// Create a new call with the correct address space.
4773+
NewCall =
4774+
Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)});
4775+
Value *Res = NewCall;
4776+
Res = Builder.CreateAddrSpaceCast(
4777+
Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
4778+
Res = Builder.CreateAddrSpaceCast(
4779+
Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED));
4780+
NewCall->takeName(CI);
4781+
CI->replaceAllUsesWith(Res);
4782+
CI->eraseFromParent();
4783+
return;
4784+
}
4785+
case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
4786+
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster:
4787+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
4788+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
4789+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
4790+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
4791+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
4792+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
4793+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
4794+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: {
4795+
4796+
SmallVector<Value *, 4> Args(CI->args());
4797+
Args[0] = Builder.CreateAddrSpaceCast(
4798+
Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
4799+
Args[0] = Builder.CreateAddrSpaceCast(
4800+
Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_DSHARED));
4801+
4802+
NewCall = Builder.CreateCall(NewFn, Args);
4803+
NewCall->takeName(CI);
4804+
CI->replaceAllUsesWith(NewCall);
4805+
CI->eraseFromParent();
4806+
return;
4807+
}
47214808
case Intrinsic::riscv_sha256sig0:
47224809
case Intrinsic::riscv_sha256sig1:
47234810
case Intrinsic::riscv_sha256sum0:

llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
285285
case NVPTX::AddressSpace::Global:
286286
case NVPTX::AddressSpace::Const:
287287
case NVPTX::AddressSpace::Shared:
288+
case NVPTX::AddressSpace::Dshared:
288289
case NVPTX::AddressSpace::Param:
289290
case NVPTX::AddressSpace::Local:
290291
O << "." << A;

llvm/lib/Target/NVPTX/NVPTX.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ enum AddressSpace : AddressSpaceUnderlyingType {
176176
Shared = 3,
177177
Const = 4,
178178
Local = 5,
179+
Dshared = 7,
179180

180181
// NVPTX Backend Private:
181182
Param = 101

llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,11 @@ static AliasResult::Kind getAliasResult(unsigned AS1, unsigned AS2) {
8686
// TODO: cvta.param is not yet supported. We need to change aliasing
8787
// rules once it is added.
8888

89+
// Distributed shared memory aliases with shared memory.
90+
if (((AS1 == ADDRESS_SPACE_SHARED) && (AS2 == ADDRESS_SPACE_DSHARED)) ||
91+
((AS1 == ADDRESS_SPACE_DSHARED) && (AS2 == ADDRESS_SPACE_SHARED)))
92+
return AliasResult::MayAlias;
93+
8994
return (AS1 == AS2 ? AliasResult::MayAlias : AliasResult::NoAlias);
9095
}
9196

0 commit comments

Comments
 (0)