Skip to content

Commit 36c29a8

Browse files
committed
properly remove the existing intrinsics changes
1 parent c0637c6 commit 36c29a8

File tree

10 files changed

+102
-274
lines changed

10 files changed

+102
-274
lines changed

clang/test/CodeGenCUDA/builtins-sm90.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
5050
auto * sptr = (__attribute__((address_space(3))) void *)ptr;
5151
// CHECK: call ptr @llvm.nvvm.mapa(ptr %{{.*}}, i32 %{{.*}})
5252
out[i++] = (long) __nvvm_mapa(ptr, u);
53-
// CHECK: call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
53+
// CHECK: call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
5454
out[i++] = (long) __nvvm_mapa_shared_cluster(sptr, u);
5555
// CHECK: call i32 @llvm.nvvm.getctarank(ptr {{.*}})
5656
out[i++] = __nvvm_getctarank(ptr);

llvm/docs/NVPTXUsage.rst

Lines changed: 3 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@ The NVPTX back-end uses the following address space mapping:
108108
3 Shared
109109
4 Constant
110110
5 Local
111-
7 Shared Cluster
112111
============= ======================
113112

114113
Every global variable and pointer type is assigned to one of these address
@@ -307,32 +306,6 @@ If the given pointer in the generic address space refers to memory which falls
307306
within the state space of the intrinsic (and therefore could be safely address
308307
space casted to this space), 1 is returned, otherwise 0 is returned.
309308

310-
'``llvm.nvvm.mapa.*``' Intrinsics
311-
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
312-
313-
Syntax:
314-
"""""""
315-
316-
.. code-block:: llvm
317-
318-
declare ptr @llvm.nvvm.mapa(ptr %p, i32 %rank)
319-
declare ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %rank)
320-
321-
Overview:
322-
"""""""""
323-
324-
The '``llvm.nvvm.mapa.*``' intrinsics map a shared memory pointer ``p`` of another CTA with ``%rank`` to the current CTA.
325-
The ``llvm.nvvm.mapa`` form expects a generic pointer to shared memory and returns a generic pointer to shared cluster memory.
326-
The ``llvm.nvvm.mapa.shared.cluster`` form expects a pointer to shared memory and returns a pointer to shared cluster memory.
327-
They correspond directly to the ``mapa`` and ``mapa.shared.cluster`` PTX instructions.
328-
329-
Semantics:
330-
""""""""""
331-
332-
Given a pointer ``%p`` to shared memory within the current CTA and the rank
``%rank`` of another CTA in the cluster, the returned pointer addresses the
corresponding location in the shared memory of the CTA with rank ``%rank``.
335-
336309
Arithmetic Intrinsics
337310
---------------------
338311

@@ -579,7 +552,7 @@ Syntax:
579552

580553
.. code-block:: llvm
581554
582-
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
555+
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
583556
584557
Overview:
585558
"""""""""
@@ -643,7 +616,7 @@ Syntax:
643616

644617
.. code-block:: llvm
645618
646-
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
619+
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
647620
648621
Overview:
649622
"""""""""
@@ -798,7 +771,7 @@ Syntax:
798771

799772
.. code-block:: llvm
800773
801-
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
774+
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
802775
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
803776
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
804777
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 26 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -127,11 +127,10 @@
127127
// * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap
128128
// * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap
129129

130-
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
131-
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
132-
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
133-
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
134-
def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
130+
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
131+
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
132+
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
133+
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
135134

136135
//
137136
// MISC
@@ -692,15 +691,15 @@ class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
692691
list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
693692
list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
694693
list<LLVMType> ArgsTy = !listconcat(
695-
[llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
696-
llvm_shared_ptr_ty, // mbarrier_smem_ptr
697-
llvm_ptr_ty], // tensormap_ptr
698-
TensorDimsTy, // actual tensor dims
699-
Im2ColOffsetsTy, // im2col offsets
700-
[llvm_i16_ty, // cta_mask
701-
llvm_i64_ty, // cache_hint
702-
llvm_i1_ty, // Flag for cta_mask
703-
llvm_i1_ty] // Flag for cache_hint
694+
[llvm_shared_ptr_ty, // dst_smem_ptr
695+
llvm_shared_ptr_ty, // mbarrier_smem_ptr
696+
llvm_ptr_ty], // tensormap_ptr
697+
TensorDimsTy, // actual tensor dims
698+
Im2ColOffsetsTy, // im2col offsets
699+
[llvm_i16_ty, // cta_mask
700+
llvm_i64_ty, // cache_hint
701+
llvm_i1_ty, // Flag for cta_mask
702+
llvm_i1_ty] // Flag for cache_hint
704703
);
705704

706705
int TempFlagsStartIdx = !add(dim, 5);
@@ -5135,7 +5134,7 @@ def int_nvvm_mapa
51355134
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
51365135
"llvm.nvvm.mapa">;
51375136
def int_nvvm_mapa_shared_cluster
5138-
: DefaultAttrsIntrinsic<[llvm_shared_cluster_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
5137+
: DefaultAttrsIntrinsic<[llvm_shared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
51395138
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
51405139
"llvm.nvvm.mapa.shared.cluster">;
51415140
def int_nvvm_getctarank
@@ -5235,14 +5234,14 @@ def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
52355234
// From Global to Shared Cluster
52365235
def int_nvvm_cp_async_bulk_global_to_shared_cluster
52375236
: DefaultAttrsIntrinsic<[],
5238-
[llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
5239-
llvm_shared_ptr_ty, // mbarrier_ptr
5240-
llvm_global_ptr_ty, // src_gmem_ptr
5241-
llvm_i32_ty, // copy_size
5242-
llvm_i16_ty, // cta_mask
5243-
llvm_i64_ty, // cache_hint
5244-
llvm_i1_ty, // Flag for cta_mask
5245-
llvm_i1_ty], // Flag for cache_hint
5237+
[llvm_shared_ptr_ty, // dst_smem_ptr
5238+
llvm_shared_ptr_ty, // mbarrier_ptr
5239+
llvm_global_ptr_ty, // src_gmem_ptr
5240+
llvm_i32_ty, // copy_size
5241+
llvm_i16_ty, // cta_mask
5242+
llvm_i64_ty, // cache_hint
5243+
llvm_i1_ty, // Flag for cta_mask
5244+
llvm_i1_ty], // Flag for cache_hint
52465245
[IntrConvergent, IntrArgMemOnly,
52475246
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
52485247
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
@@ -5252,10 +5251,10 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster
52525251
// From Shared CTA to Shared Cluster
52535252
def int_nvvm_cp_async_bulk_shared_cta_to_cluster
52545253
: DefaultAttrsIntrinsic<[],
5255-
[llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
5256-
llvm_shared_ptr_ty, // mbarrier_ptr
5257-
llvm_shared_ptr_ty, // src_smem_ptr
5258-
llvm_i32_ty], // copy_size
5254+
[llvm_shared_ptr_ty, // dst_smem_ptr
5255+
llvm_shared_ptr_ty, // mbarrier_ptr
5256+
llvm_shared_ptr_ty, // src_smem_ptr
5257+
llvm_i32_ty], // copy_size
52595258
[IntrConvergent, IntrArgMemOnly,
52605259
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
52615260
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 0 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@
4646
#include "llvm/Support/AMDGPUAddrSpace.h"
4747
#include "llvm/Support/CommandLine.h"
4848
#include "llvm/Support/ErrorHandling.h"
49-
#include "llvm/Support/NVPTXAddrSpace.h"
5049
#include "llvm/Support/Regex.h"
5150
#include "llvm/TargetParser/Triple.h"
5251
#include <cstdint>
@@ -939,47 +938,6 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
939938
return false; // No other 'arm.*', 'aarch64.*'.
940939
}
941940

942-
static Intrinsic::ID shouldUpgradeNVPTXSharedClusterIntrinsic(Function *F,
943-
StringRef Name) {
944-
if (Name.consume_front("mapa.shared.cluster"))
945-
if (F->getReturnType()->getPointerAddressSpace() ==
946-
NVPTXAS::ADDRESS_SPACE_SHARED)
947-
return Intrinsic::nvvm_mapa_shared_cluster;
948-
949-
if (Name.consume_front("cp.async.bulk.")) {
950-
Intrinsic::ID ID =
951-
StringSwitch<Intrinsic::ID>(Name)
952-
.Case("global.to.shared.cluster",
953-
Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster)
954-
.Case("shared.cta.to.cluster",
955-
Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster)
956-
.Case("tensor.g2s.im2col.3d",
957-
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d)
958-
.Case("tensor.g2s.im2col.4d",
959-
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d)
960-
.Case("tensor.g2s.im2col.5d",
961-
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d)
962-
.Case("tensor.g2s.tile.1d",
963-
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d)
964-
.Case("tensor.g2s.tile.2d",
965-
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d)
966-
.Case("tensor.g2s.tile.3d",
967-
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d)
968-
.Case("tensor.g2s.tile.4d",
969-
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d)
970-
.Case("tensor.g2s.tile.5d",
971-
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d)
972-
.Default(Intrinsic::not_intrinsic);
973-
974-
if (ID != Intrinsic::not_intrinsic)
975-
if (F->getArg(0)->getType()->getPointerAddressSpace() ==
976-
NVPTXAS::ADDRESS_SPACE_SHARED)
977-
return ID;
978-
}
979-
980-
return Intrinsic::not_intrinsic;
981-
}
982-
983941
static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) {
984942
if (Name.consume_front("fma.rn."))
985943
return StringSwitch<Intrinsic::ID>(Name)
@@ -1320,14 +1278,6 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
13201278
}
13211279
}
13221280

1323-
// Upgrade Distributed Shared Memory Intrinsics
1324-
Intrinsic::ID IID = shouldUpgradeNVPTXSharedClusterIntrinsic(F, Name);
1325-
if (IID != Intrinsic::not_intrinsic) {
1326-
rename(F);
1327-
NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
1328-
return true;
1329-
}
1330-
13311281
// The following nvvm intrinsics correspond exactly to an LLVM idiom, but
13321282
// not to an intrinsic alone. We expand them in UpgradeIntrinsicCall.
13331283
//
@@ -4768,39 +4718,6 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
47684718
CI->eraseFromParent();
47694719
return;
47704720
}
4771-
case Intrinsic::nvvm_mapa_shared_cluster: {
4772-
// Create a new call with the correct address space.
4773-
NewCall =
4774-
Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)});
4775-
Value *Res = NewCall;
4776-
Res = Builder.CreateAddrSpaceCast(
4777-
Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED));
4778-
NewCall->takeName(CI);
4779-
CI->replaceAllUsesWith(Res);
4780-
CI->eraseFromParent();
4781-
return;
4782-
}
4783-
case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
4784-
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster:
4785-
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
4786-
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
4787-
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
4788-
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
4789-
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
4790-
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
4791-
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
4792-
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: {
4793-
// Create a new call with the correct address space.
4794-
SmallVector<Value *, 4> Args(CI->args());
4795-
Args[0] = Builder.CreateAddrSpaceCast(
4796-
Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER));
4797-
4798-
NewCall = Builder.CreateCall(NewFn, Args);
4799-
NewCall->takeName(CI);
4800-
CI->replaceAllUsesWith(NewCall);
4801-
CI->eraseFromParent();
4802-
return;
4803-
}
48044721
case Intrinsic::riscv_sha256sig0:
48054722
case Intrinsic::riscv_sha256sig1:
48064723
case Intrinsic::riscv_sha256sum0:

llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll

Lines changed: 0 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -59,21 +59,6 @@ declare float @llvm.nvvm.ldg.global.f.f32.p0(ptr, i32)
5959
declare i32 @llvm.nvvm.atomic.load.inc.32(ptr, i32)
6060
declare i32 @llvm.nvvm.atomic.load.dec.32(ptr, i32)
6161

62-
declare ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3), i32)
63-
64-
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i16, i64, i1, i1)
65-
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(3), i32)
66-
67-
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
68-
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
69-
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
70-
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
71-
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
72-
73-
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
74-
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
75-
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
76-
7762
; CHECK-LABEL: @simple_upgrade
7863
define void @simple_upgrade(i32 %a, i64 %b, i16 %c) {
7964
; CHECK: call i32 @llvm.bitreverse.i32(i32 %a)
@@ -269,45 +254,3 @@ define i32 @atomics(ptr %p0, i32 %a) {
269254
ret i32 %r2
270255
}
271256

272-
; CHECK-LABEL: @nvvm_shared_cluster_intrinsics
273-
define void @nvvm_shared_cluster_intrinsics(ptr addrspace(3) %p0, i32 %offset) {
274-
; CHECK: %r = call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p0, i32 %offset)
275-
%r = call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p0, i32 %offset)
276-
ret void
277-
}
278-
279-
; CHECK-LABEL: @nvvm_cp_async_bulk_intrinsics
280-
define void @nvvm_cp_async_bulk_intrinsics(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, ptr addrspace(3) %src_shared, i32 %size) {
281-
; CHECK: call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 false, i1 false)
282-
; CHECK: call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr addrspace(3) %src_shared, i32 %size)
283-
call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 false, i1 false)
284-
call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src_shared, i32 %size)
285-
ret void
286-
}
287-
288-
; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_im2col
289-
define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
290-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 false, i1 false)
291-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 false, i1 false)
292-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 false, i1 false)
293-
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 0, i1 0)
294-
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 0, i1 0)
295-
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 0, i1 0)
296-
ret void
297-
}
298-
299-
; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_tile
300-
define void @nvvm_cp_async_bulk_tensor_g2s_tile(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
301-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 false, i1 false)
302-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 false, i1 false)
303-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 false, i1 false)
304-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 false, i1 false)
305-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 false, i1 false)
306-
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 0, i1 0)
307-
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 0, i1 0)
308-
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 0, i1 0)
309-
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 0, i1 0)
310-
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 0, i1 0)
311-
ret void
312-
}
313-

0 commit comments

Comments
 (0)