@@ -2157,16 +2157,9 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
        ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
        : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
 
-#define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32) \
-  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \
-         : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))
-
-#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, \
-                                            is_s32) \
-  (is_reduce \
-       ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
-       : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \
-                                               is_s32)))
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32) \
+  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
+         : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
 
 #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
   [&]() -> auto { \
@@ -2179,48 +2172,45 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
     return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
   }()
 
-#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch) \
-  (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
-         : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
-
-static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
-                                              bool IsCacheHint, bool IsIm2Col,
-                                              bool IsReduce = false) {
+static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
+                                                       bool IsShared32,
+                                                       bool IsCacheHint,
+                                                       bool IsIm2Col) {
   if (IsIm2Col) {
     switch (Dim) {
     case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint,
+                                                     IsShared32);
     case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint,
+                                                     IsShared32);
     case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint,
+                                                     IsShared32);
     default:
       llvm_unreachable("Invalid Dimension in im2col mode for "
-                       "GetCpAsyncBulkTensorS2GOpcode.");
+                       "GetCpAsyncBulkTensorS2GReductionOpcode.");
     }
   } else {
     switch (Dim) {
     case 1:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint,
+                                                     IsShared32);
     case 2:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint,
+                                                     IsShared32);
     case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint,
+                                                     IsShared32);
     case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint,
+                                                     IsShared32);
     case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint,
+                                                     IsShared32);
     default:
-      llvm_unreachable(
-          "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
+      llvm_unreachable("Invalid Dimension in tile mode for "
+                       "GetCpAsyncBulkTensorS2GReductionOpcode.");
     }
   }
 }
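Note: assuming is_ch is true, a call such as GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint, IsShared32) funnels into CP_ASYNC_BULK_TENSOR_OPCODE(RED, 3D, TILE, IsShared32, _CH), whose token pasting yields roughly the following. This is an illustrative preprocessor expansion derived from the macros above, not part of the commit itself:

  // Sketch of the expansion for the 3D tile case with a cache hint;
  // the RED_* opcode names come from the ##dir##_##dim##_..._##mode##suffix
  // token pasting in CP_ASYNC_BULK_TENSOR_OPCODE.
  (IsShared32 ? NVPTX::CP_ASYNC_BULK_TENSOR_RED_3D_SHARED32_TILE_CH
              : NVPTX::CP_ASYNC_BULK_TENSOR_RED_3D_TILE_CH)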
@@ -2267,39 +2257,6 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
   }
 }
 
-static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
-                                                   bool IsIm2Col) {
-  if (IsIm2Col) {
-    switch (Dim) {
-    case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL, IsCacheHint);
-    case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL, IsCacheHint);
-    case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL, IsCacheHint);
-    default:
-      llvm_unreachable("Invalid Dimension in im2col mode for "
-                       "GetCpAsyncBulkTensorPrefetchOpcode.");
-    }
-  } else {
-    switch (Dim) {
-    case 1:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE, IsCacheHint);
-    case 2:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, IsCacheHint);
-    case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE, IsCacheHint);
-    case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE, IsCacheHint);
-    case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE, IsCacheHint);
-    default:
-      llvm_unreachable("Invalid Dimension in tile mode for "
-                       "GetCpAsyncBulkTensorPrefetchOpcode.");
-    }
-  }
-}
-
 static size_t GetDimsFromIntrinsic(unsigned IID) {
   switch (IID) {
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
@@ -2364,52 +2321,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
   ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 }
 
-void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
-                                                         bool IsIm2Col) {
-  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
-  // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
-  // NumOperands = {Chain, IID} + {Actual intrinsic args}
-  //             = {2}          + {4 + dims}
-  size_t NumOps = N->getNumOperands();
-  size_t NumDims = NumOps - 6;
-  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
-  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
-
-  SDLoc DL(N);
-  SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
-  Ops.push_back(N->getOperand(0)); // Chain operand
-
-  bool IsShared32 =
-      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
-  unsigned Opcode =
-      GetCpAsyncBulkTensorS2GOpcode(NumDims, IsShared32, IsCacheHint, IsIm2Col);
-  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
-}
-
-void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
-                                                              bool IsIm2Col) {
-  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
-  // {src, dims{d0...dN}, im2col_offsets{dims-2}
-  //  cache_hint, cache_hint_flag}
-  // NumOperands = {Chain, IID} + {Actual intrinsic args}
-  //             = {2}          + {3 + dims + im2col_offsets}
-  size_t NumOps = N->getNumOperands();
-  size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
-                            : (NumOps - 5);
-  // Offsets is always 'NumDims - 2' and only for im2col mode
-  size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
-  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
-  size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);
-
-  SDLoc DL(N);
-  SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
-  Ops.push_back(N->getOperand(0)); // Chain operand
-
-  unsigned Opcode =
-      GetCpAsyncBulkTensorPrefetchOpcode(NumDims, IsCacheHint, IsIm2Col);
-  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
-}
-
 void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
                                                             unsigned RedOp,
                                                             bool IsIm2Col) {
@@ -2429,8 +2340,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
 
   bool IsShared32 =
       CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
-  unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode(
-      NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true);
+  unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode(
+      NumDims, IsShared32, IsCacheHint, IsIm2Col);
   ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 }
 
@@ -2550,18 +2461,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   switch (IID) {
   default:
     return false;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
-    SelectCpAsyncBulkTensorS2GCommon(N);
-    return true;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
-    SelectCpAsyncBulkTensorS2GCommon(N, /*IsIm2Col=*/true);
-    return true;
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
@@ -2574,18 +2473,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
     SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
     return true;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
-    SelectCpAsyncBulkTensorPrefetchCommon(N);
-    return true;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
-    SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
-    return true;
   case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d: