Skip to content

Commit 5a1c217

Browse files
committed
initial implementation
1 parent 93370c4 commit 5a1c217

File tree

23 files changed

+558
-112
lines changed

23 files changed

+558
-112
lines changed

clang/lib/Basic/Targets/NVPTX.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,11 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
7171

7272
if (TargetPointerWidth == 32)
7373
resetDataLayout(
74-
"e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
74+
"e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
7575
else if (Opts.NVPTXUseShortPointers)
76-
resetDataLayout("e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:"
77-
"16-v32:32-n16:32:64");
76+
resetDataLayout(
77+
"e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:"
78+
"16-v32:32-n16:32:64");
7879
else
7980
resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
8081

clang/test/CodeGen/target-data.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@
160160

161161
// RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
162162
// RUN: FileCheck %s -check-prefix=NVPTX
163-
// NVPTX: target datalayout = "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
163+
// NVPTX: target datalayout = "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
164164

165165
// RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \
166166
// RUN: FileCheck %s -check-prefix=NVPTX64

clang/test/CodeGenCUDA/builtins-sm90.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
5050
auto * sptr = (__attribute__((address_space(3))) void *)ptr;
5151
// CHECK: call ptr @llvm.nvvm.mapa(ptr %{{.*}}, i32 %{{.*}})
5252
out[i++] = (long) __nvvm_mapa(ptr, u);
53-
// CHECK: call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
53+
// CHECK: call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
5454
out[i++] = (long) __nvvm_mapa_shared_cluster(sptr, u);
5555
// CHECK: call i32 @llvm.nvvm.getctarank(ptr {{.*}})
5656
out[i++] = __nvvm_getctarank(ptr);

llvm/docs/NVPTXUsage.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -499,7 +499,7 @@ Syntax:
499499

500500
.. code-block:: llvm
501501
502-
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
502+
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
503503
504504
Overview:
505505
"""""""""
@@ -563,7 +563,7 @@ Syntax:
563563

564564
.. code-block:: llvm
565565
566-
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
566+
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
567567
568568
Overview:
569569
"""""""""
@@ -718,7 +718,7 @@ Syntax:
718718

719719
.. code-block:: llvm
720720
721-
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
721+
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
722722
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
723723
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
724724
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
131131
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
132132
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
133133
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
134+
def llvm_dshared_ptr_ty : LLVMQualPointerType<7>; // (dshared)ptr
134135

135136
//
136137
// MISC
@@ -691,15 +692,15 @@ class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
691692
list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
692693
list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
693694
list<LLVMType> ArgsTy = !listconcat(
694-
[llvm_shared_ptr_ty, // dst_smem_ptr
695-
llvm_shared_ptr_ty, // mbarrier_smem_ptr
696-
llvm_ptr_ty], // tensormap_ptr
697-
TensorDimsTy, // actual tensor dims
698-
Im2ColOffsetsTy, // im2col offsets
699-
[llvm_i16_ty, // cta_mask
700-
llvm_i64_ty, // cache_hint
701-
llvm_i1_ty, // Flag for cta_mask
702-
llvm_i1_ty] // Flag for cache_hint
695+
[llvm_dshared_ptr_ty, // dst_smem_ptr
696+
llvm_shared_ptr_ty, // mbarrier_smem_ptr
697+
llvm_ptr_ty], // tensormap_ptr
698+
TensorDimsTy, // actual tensor dims
699+
Im2ColOffsetsTy, // im2col offsets
700+
[llvm_i16_ty, // cta_mask
701+
llvm_i64_ty, // cache_hint
702+
llvm_i1_ty, // Flag for cta_mask
703+
llvm_i1_ty] // Flag for cache_hint
703704
);
704705

705706
int TempFlagsStartIdx = !add(dim, 5);
@@ -5087,7 +5088,7 @@ def int_nvvm_mapa
50875088
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
50885089
"llvm.nvvm.mapa">;
50895090
def int_nvvm_mapa_shared_cluster
5090-
: DefaultAttrsIntrinsic<[llvm_shared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
5091+
: DefaultAttrsIntrinsic<[llvm_dshared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
50915092
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
50925093
"llvm.nvvm.mapa.shared.cluster">;
50935094
def int_nvvm_getctarank
@@ -5187,14 +5188,14 @@ def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
51875188
// From Global to Shared Cluster
51885189
def int_nvvm_cp_async_bulk_global_to_shared_cluster
51895190
: DefaultAttrsIntrinsic<[],
5190-
[llvm_shared_ptr_ty, // dst_smem_ptr
5191-
llvm_shared_ptr_ty, // mbarrier_ptr
5192-
llvm_global_ptr_ty, // src_gmem_ptr
5193-
llvm_i32_ty, // copy_size
5194-
llvm_i16_ty, // cta_mask
5195-
llvm_i64_ty, // cache_hint
5196-
llvm_i1_ty, // Flag for cta_mask
5197-
llvm_i1_ty], // Flag for cache_hint
5191+
[llvm_dshared_ptr_ty, // dst_dsmem_ptr
5192+
llvm_shared_ptr_ty, // mbarrier_ptr
5193+
llvm_global_ptr_ty, // src_gmem_ptr
5194+
llvm_i32_ty, // copy_size
5195+
llvm_i16_ty, // cta_mask
5196+
llvm_i64_ty, // cache_hint
5197+
llvm_i1_ty, // Flag for cta_mask
5198+
llvm_i1_ty], // Flag for cache_hint
51985199
[IntrConvergent, IntrArgMemOnly,
51995200
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
52005201
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
@@ -5204,10 +5205,10 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster
52045205
// From Shared CTA to Shared Cluster
52055206
def int_nvvm_cp_async_bulk_shared_cta_to_cluster
52065207
: DefaultAttrsIntrinsic<[],
5207-
[llvm_shared_ptr_ty, // dst_smem_ptr
5208-
llvm_shared_ptr_ty, // mbarrier_ptr
5209-
llvm_shared_ptr_ty, // src_smem_ptr
5210-
llvm_i32_ty], // copy_size
5208+
[llvm_dshared_ptr_ty, // dst_dsmem_ptr
5209+
llvm_shared_ptr_ty, // mbarrier_ptr
5210+
llvm_shared_ptr_ty, // src_smem_ptr
5211+
llvm_i32_ty], // copy_size
52115212
[IntrConvergent, IntrArgMemOnly,
52125213
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
52135214
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,

llvm/include/llvm/Support/NVPTXAddrSpace.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ enum AddressSpace : unsigned {
2525
ADDRESS_SPACE_CONST = 4,
2626
ADDRESS_SPACE_LOCAL = 5,
2727
ADDRESS_SPACE_TENSOR = 6,
28+
ADDRESS_SPACE_DSHARED = 7,
2829

2930
ADDRESS_SPACE_PARAM = 101,
3031
};

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include "llvm/Support/AMDGPUAddrSpace.h"
4747
#include "llvm/Support/CommandLine.h"
4848
#include "llvm/Support/ErrorHandling.h"
49+
#include "llvm/Support/NVPTXAddrSpace.h"
4950
#include "llvm/Support/Regex.h"
5051
#include "llvm/TargetParser/Triple.h"
5152
#include <cstdint>
@@ -938,6 +939,47 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
938939
return false; // No other 'arm.*', 'aarch64.*'.
939940
}
940941

942+
static Intrinsic::ID shouldUpgradeNVPTXDSharedIntrinsic(Function *F,
943+
StringRef Name) {
944+
if (Name.consume_front("mapa.shared.cluster"))
945+
if (F->getReturnType()->getPointerAddressSpace() ==
946+
NVPTXAS::ADDRESS_SPACE_SHARED)
947+
return Intrinsic::nvvm_mapa_shared_cluster;
948+
949+
if (Name.consume_front("cp.async.bulk.")) {
950+
Intrinsic::ID ID =
951+
StringSwitch<Intrinsic::ID>(Name)
952+
.Case("global.to.shared.cluster",
953+
Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster)
954+
.Case("shared.cta.to.cluster",
955+
Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster)
956+
.Case("tensor.g2s.im2col.3d",
957+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d)
958+
.Case("tensor.g2s.im2col.4d",
959+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d)
960+
.Case("tensor.g2s.im2col.5d",
961+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d)
962+
.Case("tensor.g2s.tile.1d",
963+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d)
964+
.Case("tensor.g2s.tile.2d",
965+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d)
966+
.Case("tensor.g2s.tile.3d",
967+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d)
968+
.Case("tensor.g2s.tile.4d",
969+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d)
970+
.Case("tensor.g2s.tile.5d",
971+
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d)
972+
.Default(Intrinsic::not_intrinsic);
973+
974+
if (ID != Intrinsic::not_intrinsic)
975+
if (F->getArg(0)->getType()->getPointerAddressSpace() ==
976+
NVPTXAS::ADDRESS_SPACE_SHARED)
977+
return ID;
978+
}
979+
980+
return Intrinsic::not_intrinsic;
981+
}
982+
941983
static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) {
942984
if (Name.consume_front("abs."))
943985
return StringSwitch<Intrinsic::ID>(Name)
@@ -1284,6 +1326,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
12841326
}
12851327
}
12861328

1329+
// Upgrade Distributed Shared Memory Intrinsics
1330+
Intrinsic::ID IID = shouldUpgradeNVPTXDSharedIntrinsic(F, Name);
1331+
if (IID != Intrinsic::not_intrinsic) {
1332+
rename(F);
1333+
NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
1334+
return true;
1335+
}
1336+
12871337
// The following nvvm intrinsics correspond exactly to an LLVM idiom, but
12881338
// not to an intrinsic alone. We expand them in UpgradeIntrinsicCall.
12891339
//
@@ -4704,6 +4754,43 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
47044754
CI->eraseFromParent();
47054755
return;
47064756
}
4757+
case Intrinsic::nvvm_mapa_shared_cluster: {
4758+
// Create a new call with the correct address space.
4759+
NewCall =
4760+
Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)});
4761+
Value *Res = NewCall;
4762+
Res = Builder.CreateAddrSpaceCast(
4763+
Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
4764+
Res = Builder.CreateAddrSpaceCast(
4765+
Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED));
4766+
NewCall->takeName(CI);
4767+
CI->replaceAllUsesWith(Res);
4768+
CI->eraseFromParent();
4769+
return;
4770+
}
4771+
case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
4772+
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster:
4773+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
4774+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
4775+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
4776+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
4777+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
4778+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
4779+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
4780+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: {
4781+
4782+
SmallVector<Value *, 4> Args(CI->args());
4783+
Args[0] = Builder.CreateAddrSpaceCast(
4784+
Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
4785+
Args[0] = Builder.CreateAddrSpaceCast(
4786+
Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_DSHARED));
4787+
4788+
NewCall = Builder.CreateCall(NewFn, Args);
4789+
NewCall->takeName(CI);
4790+
CI->replaceAllUsesWith(NewCall);
4791+
CI->eraseFromParent();
4792+
return;
4793+
}
47074794
case Intrinsic::riscv_sha256sig0:
47084795
case Intrinsic::riscv_sha256sig1:
47094796
case Intrinsic::riscv_sha256sum0:

llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
288288
case NVPTX::AddressSpace::Global:
289289
case NVPTX::AddressSpace::Const:
290290
case NVPTX::AddressSpace::Shared:
291+
case NVPTX::AddressSpace::Dshared:
291292
case NVPTX::AddressSpace::Param:
292293
case NVPTX::AddressSpace::Local:
293294
O << "." << A;

llvm/lib/Target/NVPTX/NVPTX.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ enum AddressSpace : AddressSpaceUnderlyingType {
176176
Shared = 3,
177177
Const = 4,
178178
Local = 5,
179+
Dshared = 7,
179180

180181
// NVPTX Backend Private:
181182
Param = 101

llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,11 @@ static AliasResult::Kind getAliasResult(unsigned AS1, unsigned AS2) {
8686
// TODO: cvta.param is not yet supported. We need to change aliasing
8787
// rules once it is added.
8888

89+
// Distributed shared memory aliases with shared memory.
90+
if (((AS1 == ADDRESS_SPACE_SHARED) && (AS2 == ADDRESS_SPACE_DSHARED)) ||
91+
((AS1 == ADDRESS_SPACE_DSHARED) && (AS2 == ADDRESS_SPACE_SHARED)))
92+
return AliasResult::MayAlias;
93+
8994
return (AS1 == AS2 ? AliasResult::MayAlias : AliasResult::NoAlias);
9095
}
9196

0 commit comments

Comments
 (0)