Skip to content

Commit 359413a

Browse files
committed
[NVPTX] Add TMA Bulk Copy Intrinsics
This patch adds a new variant of TMA Bulk Copy intrinsics introduced in sm100+. This variant has an additional byte_mask to select the bytes for the copy operation. Signed-off-by: Durgadoss R <[email protected]>
1 parent ffc5f79 commit 359413a

File tree

6 files changed

+123
-27
lines changed

6 files changed

+123
-27
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,7 @@ Syntax:
616616
.. code-block:: llvm
617617
618618
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 %flag_ch)
619+
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(..., i32 %size, i16 %mask, i64 %ch, i1 %flag_ch)
619620
620621
Overview:
621622
"""""""""
@@ -624,7 +625,10 @@ The '``@llvm.nvvm.cp.async.bulk.shared.cta.to.global``' intrinsic
624625
corresponds to the ``cp.async.bulk.global.shared::cta.*`` set of PTX
625626
instructions. These instructions initiate an asynchronous copy from
626627
shared::cta to global memory. The 32-bit operand ``%size`` specifies
627-
the amount of memory to be copied and it must be a multiple of 16.
628+
the amount of memory to be copied (in bytes) and it must be a multiple
629+
of 16. For the ``.bytemask`` variant, the 16-bit wide mask operand
630+
specifies whether the i-th byte of each 16-byte wide chunk of source
631+
data is copied to the destination.
628632

629633
* The last argument to these intrinsics is a boolean flag
630634
indicating support for cache_hint. This flag argument must

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5323,6 +5323,20 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global
53235323
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
53245324
ImmArg<ArgIndex<4>>]>;
53255325

5326+
// From Shared CTA to Global memory with bytemask
5327+
def int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask
5328+
: DefaultAttrsIntrinsic<[],
5329+
[llvm_global_ptr_ty, // dst_gmem_ptr
5330+
llvm_shared_ptr_ty, // src_smem_ptr
5331+
llvm_i32_ty, // copy_size
5332+
llvm_i16_ty, // byte_mask
5333+
llvm_i64_ty, // cache_hint
5334+
llvm_i1_ty], // Flag for cache_hint
5335+
[IntrConvergent, IntrArgMemOnly,
5336+
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
5337+
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
5338+
ImmArg<ArgIndex<5>>]>;
5339+
53265340
// Intrinsics for Bulk Copy Prefetch L2
53275341
def int_nvvm_cp_async_bulk_prefetch_L2
53285342
: DefaultAttrsIntrinsic<[],

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2720,28 +2720,46 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
27202720
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
27212721
}
27222722

2723-
void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2G(SDNode *N) {
2723+
void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2GCommon(SDNode *N, bool HasMask) {
27242724
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
2725-
// dst, src, size, cache_hint, cache_hint_flag
2725+
// dst, src, size, mask, cache_hint, cache_hint_flag
27262726
// NumOperands = {Chain, IID} + {Actual intrinsic args}
2727-
// = {2} + {5}
2727+
// = {2} + {6}
27282728
size_t NumOps = N->getNumOperands();
27292729
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2730-
size_t NumArgs = IsCacheHint ? 4 : 3; // src, dst, size, cache_hint
2730+
size_t CacheHintIdx = NumOps - 2;
27312731

27322732
SDLoc DL(N);
2733-
SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
2734-
Ops.push_back(N->getOperand(0)); // Chain operand
2733+
SDValue Offset, Base;
2734+
SelectADDR(N->getOperand(3), Base, Offset); // src
27352735

2736-
bool IsShared32 =
2737-
CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2738-
unsigned Opcode;
2736+
SmallVector<SDValue, 8> Ops;
2737+
// BaseArgs: {dst, src, size}
2738+
Ops.push_back(N->getOperand(2)); // dst
2739+
Ops.push_back(Base); // src
2740+
Ops.push_back(Offset); // src
2741+
Ops.push_back(N->getOperand(4)); // size
2742+
2743+
// Push Mask operand, if available
2744+
if (HasMask)
2745+
Ops.push_back(N->getOperand(CacheHintIdx - 1));
2746+
2747+
// Push CacheHint operand, if available
27392748
if (IsCacheHint)
2740-
Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32_CH
2741-
: NVPTX::CP_ASYNC_BULK_S2G_CH;
2742-
else
2743-
Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32
2744-
: NVPTX::CP_ASYNC_BULK_S2G;
2749+
Ops.push_back(N->getOperand(CacheHintIdx));
2750+
2751+
// Finally, the chain operand
2752+
Ops.push_back(N->getOperand(0));
2753+
2754+
unsigned Opcode = [&]() {
2755+
if (HasMask && IsCacheHint)
2756+
return NVPTX::CP_ASYNC_BULK_S2G_BM_CH;
2757+
if (HasMask)
2758+
return NVPTX::CP_ASYNC_BULK_S2G_BM;
2759+
if (IsCacheHint)
2760+
return NVPTX::CP_ASYNC_BULK_S2G_CH;
2761+
return NVPTX::CP_ASYNC_BULK_S2G;
2762+
}();
27452763
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
27462764
}
27472765

@@ -2928,7 +2946,10 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
29282946
SelectCpAsyncBulkG2S(N);
29292947
return true;
29302948
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global:
2931-
SelectCpAsyncBulkS2G(N);
2949+
SelectCpAsyncBulkS2GCommon(N);
2950+
return true;
2951+
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global_bytemask:
2952+
SelectCpAsyncBulkS2GCommon(N, /*HasMask=*/true);
29322953
return true;
29332954
case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
29342955
SelectCpAsyncBulkPrefetchL2(N);

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
9393
void SelectV2I64toI128(SDNode *N);
9494
void SelectI128toV2I64(SDNode *N);
9595
void SelectCpAsyncBulkG2S(SDNode *N);
96-
void SelectCpAsyncBulkS2G(SDNode *N);
96+
void SelectCpAsyncBulkS2GCommon(SDNode *N, bool HasMask = false);
9797
void SelectCpAsyncBulkPrefetchL2(SDNode *N);
9898
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
9999
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,9 @@ class CpAsyncBulkStr<bit mc, bit ch> {
516516
string S2G = "cp.async.bulk.global.shared::cta.bulk_group"
517517
# !if(ch, ".L2::cache_hint", "");
518518

519+
// Shared to Global memory with bytemask
520+
string S2G_BM = S2G # ".cp_mask";
521+
519522
// Global to Shared cluster memory
520523
string G2S = "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes"
521524
# !if(mc, ".multicast::cluster", "")
@@ -525,18 +528,26 @@ class CpAsyncBulkStr<bit mc, bit ch> {
525528
string C2C = "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes";
526529
}
527530

528-
multiclass CP_ASYNC_BULK_S2G<NVPTXRegClass rc> {
529-
def NAME: NVPTXInst<(outs),
530-
(ins Int64Regs:$dst, rc:$src, Int32Regs:$size),
531+
def CP_ASYNC_BULK_S2G : NVPTXInst<(outs),
532+
(ins Int64Regs:$dst, ADDR:$src, Int32Regs:$size),
531533
!strconcat(CpAsyncBulkStr<0, 0>.S2G, " [$dst], [$src], $size;"), []>,
532534
Requires<[hasPTX<80>, hasSM<90>]>;
533-
def NAME # _CH: NVPTXInst<(outs),
534-
(ins Int64Regs:$dst, rc:$src, Int32Regs:$size, Int64Regs:$ch),
535-
!strconcat(CpAsyncBulkStr<0, 1>.S2G, " [$dst], [$src], $size, $ch;"), []>,
536-
Requires<[hasPTX<80>, hasSM<90>]>;
537-
}
538-
defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G<Int64Regs>;
539-
defm CP_ASYNC_BULK_S2G_SHARED32 : CP_ASYNC_BULK_S2G<Int32Regs>;
535+
536+
def CP_ASYNC_BULK_S2G_CH : NVPTXInst<(outs),
537+
(ins Int64Regs:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
538+
!strconcat(CpAsyncBulkStr<0, 1>.S2G, " [$dst], [$src], $size, $ch;"), []>,
539+
Requires<[hasPTX<80>, hasSM<90>]>;
540+
541+
// Variants with bytemask
542+
def CP_ASYNC_BULK_S2G_BM : NVPTXInst<(outs),
543+
(ins Int64Regs:$dst, ADDR:$src, Int32Regs:$size, Int16Regs:$mask),
544+
!strconcat(CpAsyncBulkStr<0, 0>.S2G_BM, " [$dst], [$src], $size, $mask;"), []>,
545+
Requires<[hasPTX<86>, hasSM<100>]>;
546+
547+
def CP_ASYNC_BULK_S2G_BM_CH : NVPTXInst<(outs),
548+
(ins Int64Regs:$dst, ADDR:$src, Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
549+
!strconcat(CpAsyncBulkStr<0, 1>.S2G_BM, " [$dst], [$src], $size, $ch, $mask;"), []>,
550+
Requires<[hasPTX<86>, hasSM<100>]>;
540551

541552
multiclass CP_ASYNC_BULK_G2S<NVPTXRegClass rc> {
542553
def NAME: NVPTXInst<(outs),
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
3+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
4+
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %}
5+
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %}
6+
7+
target triple = "nvptx64-nvidia-cuda"
8+
9+
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1), ptr addrspace(3), i32, i16, i64, i1)
10+
11+
define void @cp_async_bulk_s2g_bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i16 %mask, i64 %ch) {
12+
; CHECK-PTX64-LABEL: cp_async_bulk_s2g_bytemask(
13+
; CHECK-PTX64: {
14+
; CHECK-PTX64-NEXT: .reg .b16 %rs<2>;
15+
; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
16+
; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
17+
; CHECK-PTX64-EMPTY:
18+
; CHECK-PTX64-NEXT: // %bb.0:
19+
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_s2g_bytemask_param_0];
20+
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_s2g_bytemask_param_1];
21+
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_s2g_bytemask_param_2];
22+
; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_s2g_bytemask_param_3];
23+
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_s2g_bytemask_param_4];
24+
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%rd2], %r1, %rd3, %rs1;
25+
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%rd2], %r1, %rs1;
26+
; CHECK-PTX64-NEXT: ret;
27+
;
28+
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_s2g_bytemask(
29+
; CHECK-PTX-SHARED32: {
30+
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>;
31+
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>;
32+
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
33+
; CHECK-PTX-SHARED32-EMPTY:
34+
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
35+
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_s2g_bytemask_param_0];
36+
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_s2g_bytemask_param_1];
37+
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_s2g_bytemask_param_2];
38+
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_s2g_bytemask_param_3];
39+
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_s2g_bytemask_param_4];
40+
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%r1], %r2, %rd2, %rs1;
41+
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%r1], %r2, %rs1;
42+
; CHECK-PTX-SHARED32-NEXT: ret;
43+
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i16 %mask, i64 %ch, i1 1)
44+
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i16 %mask, i64 0, i1 0)
45+
ret void
46+
}

0 commit comments

Comments
 (0)