Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions llvm/docs/NVPTXUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,34 @@ it must be a multiple of 16.
For more information, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk>`_.

'``llvm.nvvm.cp.async.bulk.prefetch.L2``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 %flag_ch)

Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.prefetch.L2``' intrinsic
corresponds to the ``cp.async.bulk.prefetch.L2.*`` family
of PTX instructions. These instructions initiate an asynchronous
prefetch of bulk data from global memory to the L2 cache.
The 32-bit operand ``%size`` specifies the amount of memory to be
prefetched in terms of bytes and it must be a multiple of 16.

* The last argument to these intrinsics is boolean flag indicating
support for cache_hint. These flag argument must be compile-time
constant. When set, indicates a valid cache_hint (``i64 %ch``)
and generates the ``.L2::cache_hint`` variant of the PTX instruction.

For more information, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch>`_.

'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
11 changes: 11 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -5033,4 +5033,15 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
ImmArg<ArgIndex<4>>]>;

// Intrinsics for Bulk Copy Prefetch L2
def int_nvvm_cp_async_bulk_prefetch_L2
: DefaultAttrsIntrinsic<[],
[llvm_global_ptr_ty, // src_gmem_ptr
llvm_i32_ty, // copy_size
llvm_i64_ty, // cache_hint
llvm_i1_ty], // Flag for cache_hint
[IntrConvergent, IntrArgMemOnly,
NoCapture<ArgIndex<0>>, ReadOnly<ArgIndex<0>>,
ImmArg<ArgIndex<3>>]>;

} // let TargetPrefix = "nvvm"
24 changes: 24 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3105,6 +3105,27 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) {
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}

void NVPTXDAGToDAGISel::SelectCpAsyncBulkPrefetchL2(SDNode *N) {
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
// src, size, cache_hint, cache_hint_flag
// NumOperands = {Chain, IID} + {Actual intrinsic args}
// = {2} + {4}
size_t NumOps = N->getNumOperands();
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
size_t NumArgs = IsCacheHint ? 3 : 2; // src, size, cache_hint

SDLoc DL(N);
SmallVector<SDValue, 4> Ops(N->ops().slice(2, NumArgs));
Ops.push_back(N->getOperand(0)); // Chain operand

unsigned Opcode;
if (IsCacheHint)
Opcode = NVPTX::CP_ASYNC_BULK_PREFETCH_CH;
else
Opcode = NVPTX::CP_ASYNC_BULK_PREFETCH;
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}

bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
unsigned IID = N->getConstantOperandVal(1);
using TMARedTy = llvm::nvvm::TMAReductionOp;
Expand All @@ -3118,6 +3139,9 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global:
SelectCpAsyncBulkS2G(N);
return true;
case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
SelectCpAsyncBulkPrefetchL2(N);
return true;
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
void SelectI128toV2I64(SDNode *N);
void SelectCpAsyncBulkG2S(SDNode *N);
void SelectCpAsyncBulkS2G(SDNode *N);
void SelectCpAsyncBulkPrefetchL2(SDNode *N);
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
Expand Down
19 changes: 19 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,25 @@ multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER<NVPTXRegClass rc> {
defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int64Regs>;
defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int32Regs>;

//------------------------------
// Bulk Copy Prefetch Functions
//------------------------------
multiclass CP_ASYNC_BULK_PREFETCH_INTR {
defvar prefetch = "cp.async.bulk.prefetch.L2.global";
def NAME: NVPTXInst<(outs),
(ins Int64Regs:$src, Int32Regs:$size),
!strconcat(prefetch," [$src], $size;"),
[]>,
Requires<[hasPTX<80>, hasSM<90>]>;
def NAME # _CH: NVPTXInst<(outs),
(ins Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
!strconcat(prefetch,".L2::cache_hint [$src], $size, $ch;"),
[]>,
Requires<[hasPTX<80>, hasSM<90>]>;
}

defm CP_ASYNC_BULK_PREFETCH : CP_ASYNC_BULK_PREFETCH_INTR;

//-------------------------------------
// TMA Async Bulk Tensor Copy Functions
//-------------------------------------
Expand Down
19 changes: 19 additions & 0 deletions llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ target triple = "nvptx64-nvidia-cuda"
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i16, i64, i1, i1)
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1), ptr addrspace(3), i32, i64, i1)
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(3), i32)
declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1), i32, i64, i1)

define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_g2s(
Expand Down Expand Up @@ -116,3 +117,21 @@ define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size)
ret void
}

define void @cp_async_bulk_prefetch(ptr addrspace(1) %src, i32 %size, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_prefetch(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_prefetch_param_0];
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_prefetch_param_1];
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_prefetch_param_2];
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd2;
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.L2.global [%rd1], %r1;
; CHECK-PTX64-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 0, i1 0)
ret void
}
Loading