@@ -4157,9 +4157,9 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
        ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix  \
        : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
 
-#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode)                         \
-  (IsCacheHint ? (CP_ASYNC_BULK_TENSOR_OPCODE(S2G, dim, mode, _CH))            \
-               : (CP_ASYNC_BULK_TENSOR_OPCODE(S2G, dim, mode, )))
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(op, dim, mode)                      \
+  (IsCacheHint ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, _CH))             \
+               : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, )))
 
 #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode)                         \
   [&]() -> auto {                                                              \
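
Note on the macro change: the old GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G hard-coded the S2G direction token, so it could only name shared-to-global copy opcodes. The renamed GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH takes the operation token as its first argument, letting the same cache-hint dispatch serve both S2G copies and the new RED reductions. Below is a minimal standalone sketch of the token pasting involved, using mock enumerators rather than the generated NVPTX::* opcodes, and folding away the SHARED32 arm of the inner macro:

#include <cstdio>

namespace NVPTX {
enum Opcode {
  CP_ASYNC_BULK_TENSOR_S2G_3D_TILE,
  CP_ASYNC_BULK_TENSOR_S2G_3D_TILE_CH,
  CP_ASYNC_BULK_TENSOR_RED_3D_TILE,
  CP_ASYNC_BULK_TENSOR_RED_3D_TILE_CH,
};
} // namespace NVPTX

// Stand-in for CP_ASYNC_BULK_TENSOR_OPCODE with the SHARED32 ternary dropped.
#define CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, suffix)                     \
  NVPTX::CP_ASYNC_BULK_TENSOR_##op##_##dim##_##mode##suffix

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(op, dim, mode)                      \
  (IsCacheHint ? CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, _CH)               \
               : CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, ))

int main() {
  bool IsCacheHint = true;
  // RED/3D/TILE pastes to ..._RED_3D_TILE_CH when a cache hint is present,
  // and to ..._RED_3D_TILE otherwise.
  unsigned Opc = GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(RED, 3D, TILE);
  std::printf("opcode = %u\n", Opc); // prints 3 with the mock enum above
}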
@@ -4177,31 +4177,40 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
                : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
 
 static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
-                                              bool IsCacheHint, bool IsIm2Col) {
+                                              bool IsCacheHint, bool IsIm2Col,
+                                              bool IsReduce = false) {
   if (IsIm2Col) {
     switch (Dim) {
     case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL);
+      return IsReduce ? GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(RED, 3D, IM2COL)
+                      : GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(S2G, 3D, IM2COL);
     case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL);
+      return IsReduce ? GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(RED, 4D, IM2COL)
+                      : GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(S2G, 4D, IM2COL);
     case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL);
+      return IsReduce ? GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(RED, 5D, IM2COL)
+                      : GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(S2G, 5D, IM2COL);
     default:
       llvm_unreachable("Invalid Dimension in im2col mode for "
                        "GetCpAsyncBulkTensorS2GOpcode.");
     }
   } else {
     switch (Dim) {
     case 1:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE);
+      return IsReduce ? GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(RED, 1D, TILE)
+                      : GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(S2G, 1D, TILE);
     case 2:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE);
+      return IsReduce ? GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(RED, 2D, TILE)
+                      : GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(S2G, 2D, TILE);
     case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE);
+      return IsReduce ? GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(RED, 3D, TILE)
+                      : GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(S2G, 3D, TILE);
     case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE);
+      return IsReduce ? GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(RED, 4D, TILE)
+                      : GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(S2G, 4D, TILE);
     case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE);
+      return IsReduce ? GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(RED, 5D, TILE)
+                      : GET_CP_ASYNC_BULK_TENSOR_OPCODE_CH(S2G, 5D, TILE);
     default:
       llvm_unreachable(
           "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
@@ -4377,6 +4386,30 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
   ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 }
 
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
+                                                            bool IsIm2Col) {
+  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
+  // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag, reduction_kind_flag
+  // NumOperands = {Chain, IID} + {Actual intrinsic args}
+  //             = {2} + {5 + dims}
+  size_t NumOps = N->getNumOperands();
+  size_t NumDims = NumOps - 7;
+  unsigned ReductionKind = N->getConstantOperandVal(NumOps - 1);
+  bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
+  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
+
+  SDLoc DL(N);
+  SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
+  Ops.push_back(getI32Imm(ReductionKind, DL)); // Reduction Op
+  Ops.push_back(N->getOperand(0));             // Chain operand
+
+  bool IsShared32 =
+      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
+  unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode(
+      NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true);
+  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
 bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   unsigned IID = N->getConstantOperandVal(1);
   switch (IID) {
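
The operand bookkeeping in the new selection routine is easiest to verify with concrete numbers. Below is a self-contained sketch, assuming a hypothetical 3D tile-mode reduction with a cache hint; the SDNode operand list is modeled with plain strings, and only the index arithmetic mirrors SelectCpAsyncBulkTensorReduceCommon:

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

int main() {
  // {Chain, IID} + {src, dst, d0..d2, cache_hint, cache_hint_flag,
  //                 reduction_kind_flag} -> 2 + (5 + 3) = 10 operands.
  std::vector<std::string> N = {"chain", "iid", "src", "dst",
                                "d0", "d1", "d2", "cache_hint",
                                "cache_hint_flag=1", "reduction_kind=add"};
  std::size_t NumOps = N.size();    // 10
  std::size_t NumDims = NumOps - 7; // 3
  bool IsCacheHint = true;          // read from operand NumOps - 2
  std::size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // 6

  // Mirrors Ops(N->ops().slice(2, NumArgs)): take src, dst, d0..d2 and the
  // cache hint, then re-append the reduction kind as an immediate and the
  // chain, in the order the machine node expects.
  std::vector<std::string> Ops(N.begin() + 2, N.begin() + 2 + NumArgs);
  Ops.push_back("imm(reduction_kind)");
  Ops.push_back(N[0]);

  for (const std::string &Op : Ops)
    std::printf("%s\n", Op.c_str());
}

For this 3D case NumOps is 10, so NumDims = 10 - 7 = 3 and the slice of NumArgs = 6 operands covers src, dst, the three dimensions, and the cache hint; the reduction kind, which arrives as the last intrinsic operand, is re-emitted as an explicit i32 immediate just before the chain so it lands at a fixed position on the machine node.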
@@ -4418,5 +4451,17 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
     SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
     return true;
+  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_tile_1d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_tile_2d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_tile_3d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_tile_4d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_tile_5d:
+    SelectCpAsyncBulkTensorReduceCommon(N);
+    return true;
+  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_im2col_3d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_im2col_4d:
+  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_im2col_5d:
+    SelectCpAsyncBulkTensorReduceCommon(N, /*IsIm2Col=*/true);
+    return true;
   }
 }