@@ -2147,16 +2147,9 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
21472147 ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
21482148 : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
21492149
// Pre-change helper macro (every line is marked '-', i.e. deleted by this
// diff). It forwards `op` as the direction argument of
// CP_ASYNC_BULK_TENSOR_OPCODE and passes the `_CH` suffix when `is_ch` is
// true, or an empty suffix otherwise.
2150- #define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL (op, dim, mode, is_ch, is_s32 ) \
2151- (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \
2152- : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))
2153-
// Pre-change dispatcher macro (deleted by this diff). It expands to the RED
// flavour of CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL when `is_reduce` is true and
// to the S2G flavour otherwise; `is_ch` and `is_s32` are passed through.
2154- #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (dim, mode, is_reduce, is_ch, \
2155- is_s32) \
2156- (is_reduce \
2157- ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
2158- : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \
2159- is_s32)))
// Post-change macro (every line is marked '+', i.e. added by this diff). The
// direction argument is hard-wired to RED; `is_ch` alone chooses between the
// `_CH`-suffixed opcode and the unsuffixed one.
// NOTE(review): `is_s32` presumably selects the SHARED32 opcode variant inside
// CP_ASYNC_BULK_TENSOR_OPCODE - confirm against that macro's full definition.
2150+ #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (dim, mode, is_ch, is_s32 ) \
2151+ (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
2152+ : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
21602153
21612154#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S (dim, mode, is_mc, is_ch, is_s32 ) \
21622155 [&]() -> auto { \
@@ -2169,48 +2162,45 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
21692162 return CP_ASYNC_BULK_TENSOR_OPCODE (G2S, dim, mode, is_s32, ); \
21702163 }()
21712164
// Deleted by this diff. Expanded to
// NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_<dim>_<mode>_CH when `is_ch` is true
// and to the same name without the `_CH` suffix otherwise.
2172- #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (dim, mode, is_ch ) \
2173- (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
2174- : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
2175-
// Diff view of the S2G opcode selector. Lines marked '-' belong to the old
// GetCpAsyncBulkTensorS2GOpcode, which took an extra IsReduce flag defaulting
// to false. Lines marked '+' belong to its replacement,
// GetCpAsyncBulkTensorS2GReductionOpcode, which drops that flag and always
// goes through the *_S2G_RED macro. Unmarked lines are common to both.
//
// Both versions switch on Dim and return the opcode the macro expands to for
// the given (dimension, mode) pair. IsCacheHint and IsShared32 are forwarded
// to the macro. im2col mode accepts Dim 3..5 and tile mode accepts Dim 1..5;
// any other value reaches llvm_unreachable.
2176- static unsigned GetCpAsyncBulkTensorS2GOpcode (size_t Dim, bool IsShared32,
2177- bool IsCacheHint, bool IsIm2Col,
2178- bool IsReduce = false ) {
2165+ static unsigned GetCpAsyncBulkTensorS2GReductionOpcode (size_t Dim,
2166+ bool IsShared32,
2167+ bool IsCacheHint,
2168+ bool IsIm2Col) {
21792169 if (IsIm2Col) {
// im2col mode: only 3D, 4D and 5D have cases here.
21802170 switch (Dim) {
21812171 case 3 :
2182- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (3D, IM2COL, IsReduce ,
2183- IsCacheHint, IsShared32);
2172+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (3D, IM2COL, IsCacheHint ,
2173+ IsShared32);
21842174 case 4 :
2185- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (4D, IM2COL, IsReduce ,
2186- IsCacheHint, IsShared32);
2175+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (4D, IM2COL, IsCacheHint ,
2176+ IsShared32);
21872177 case 5 :
2188- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (5D, IM2COL, IsReduce ,
2189- IsCacheHint, IsShared32);
2178+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (5D, IM2COL, IsCacheHint ,
2179+ IsShared32);
21902180 default :
21912181 llvm_unreachable (" Invalid Dimension in im2col mode for "
2192- " GetCpAsyncBulkTensorS2GOpcode ." );
2182+ " GetCpAsyncBulkTensorS2GReductionOpcode ." );
21932183 }
21942184 } else {
// tile mode: 1D through 5D have cases here.
21952185 switch (Dim) {
21962186 case 1 :
2197- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (1D, TILE, IsReduce ,
2198- IsCacheHint, IsShared32);
2187+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (1D, TILE, IsCacheHint ,
2188+ IsShared32);
21992189 case 2 :
2200- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (2D, TILE, IsReduce ,
2201- IsCacheHint, IsShared32);
2190+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (2D, TILE, IsCacheHint ,
2191+ IsShared32);
22022192 case 3 :
2203- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (3D, TILE, IsReduce ,
2204- IsCacheHint, IsShared32);
2193+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (3D, TILE, IsCacheHint ,
2194+ IsShared32);
22052195 case 4 :
2206- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (4D, TILE, IsReduce ,
2207- IsCacheHint, IsShared32);
2196+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (4D, TILE, IsCacheHint ,
2197+ IsShared32);
22082198 case 5 :
2209- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (5D, TILE, IsReduce ,
2210- IsCacheHint, IsShared32);
2199+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (5D, TILE, IsCacheHint ,
2200+ IsShared32);
22112201 default :
2212- llvm_unreachable (
2213- " Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode ." );
2202+ llvm_unreachable (" Invalid Dimension in tile mode for "
2203+ " GetCpAsyncBulkTensorS2GReductionOpcode ." );
22142204 }
22152205 }
22162206}
@@ -2257,39 +2247,6 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
22572247 }
22582248}
22592249
// Deleted by this diff (every line is marked '-').
// Returned the prefetch opcode for the given dimension count by expanding
// GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH with the matching <N>D token and
// either IM2COL or TILE, depending on IsIm2Col. IsCacheHint was forwarded to
// the macro. im2col mode handled Dim 3..5 and tile mode handled Dim 1..5; any
// other value reached llvm_unreachable.
2260- static unsigned GetCpAsyncBulkTensorPrefetchOpcode (size_t Dim, bool IsCacheHint,
2261- bool IsIm2Col) {
2262- if (IsIm2Col) {
2263- switch (Dim) {
2264- case 3 :
2265- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (3D, IM2COL, IsCacheHint);
2266- case 4 :
2267- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (4D, IM2COL, IsCacheHint);
2268- case 5 :
2269- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (5D, IM2COL, IsCacheHint);
2270- default :
2271- llvm_unreachable (" Invalid Dimension in im2col mode for "
2272- " GetCpAsyncBulkTensorPrefetchOpcode." );
2273- }
2274- } else {
2275- switch (Dim) {
2276- case 1 :
2277- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (1D, TILE, IsCacheHint);
2278- case 2 :
2279- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (2D, TILE, IsCacheHint);
2280- case 3 :
2281- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (3D, TILE, IsCacheHint);
2282- case 4 :
2283- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (4D, TILE, IsCacheHint);
2284- case 5 :
2285- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (5D, TILE, IsCacheHint);
2286- default :
2287- llvm_unreachable (" Invalid Dimension in tile mode for "
2288- " GetCpAsyncBulkTensorPrefetchOpcode." );
2289- }
2290- }
2291- }
2292-
22932250static size_t GetDimsFromIntrinsic (unsigned IID) {
22942251 switch (IID) {
22952252 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
@@ -2354,52 +2311,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
23542311 ReplaceNode (N, CurDAG->getMachineNode (Opcode, DL, N->getVTList (), Ops));
23552312}
23562313
// Deleted by this diff (every line is marked '-').
// Replaced node N with a machine node whose opcode came from
// GetCpAsyncBulkTensorS2GOpcode and whose operands were the intrinsic's
// arguments followed by the chain.
2357- void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon (SDNode *N,
2358- bool IsIm2Col) {
2359- // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2360- // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2361- // NumOperands = {Chain, IID} + {Actual intrinsic args}
2362- // = {2} + {4 + dims}
2363- size_t NumOps = N->getNumOperands ();
// The 6 non-dimension operands are Chain, IID, src, dst, cache_hint and
// cache_hint_flag (see the layout above).
2364- size_t NumDims = NumOps - 6 ;
// The cache-hint flag is the last operand; the hint is used only when it is 1.
2365- bool IsCacheHint = N->getConstantOperandVal (NumOps - 1 ) == 1 ;
2366- size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2 ); // src, dst, cache_hint
2367-
2368- SDLoc DL (N);
// Take NumArgs operands starting at index 2 (past Chain and IID), then append
// the chain as the final operand.
2369- SmallVector<SDValue, 8 > Ops (N->ops ().slice (2 , NumArgs));
2370- Ops.push_back (N->getOperand (0 )); // Chain operand
2371-
// True when the DataLayout reports 32-bit pointers for ADDRESS_SPACE_SHARED.
2372- bool IsShared32 =
2373- CurDAG->getDataLayout ().getPointerSizeInBits (ADDRESS_SPACE_SHARED) == 32 ;
2374- unsigned Opcode =
2375- GetCpAsyncBulkTensorS2GOpcode (NumDims, IsShared32, IsCacheHint, IsIm2Col);
2376- ReplaceNode (N, CurDAG->getMachineNode (Opcode, DL, N->getVTList (), Ops));
2377- }
2378-
// Deleted by this diff (every line is marked '-').
// Replaced node N with a machine node whose opcode came from
// GetCpAsyncBulkTensorPrefetchOpcode and whose operands were the intrinsic's
// arguments followed by the chain.
2379- void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon (SDNode *N,
2380- bool IsIm2Col) {
2381- // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2382- // {src, dims{d0...dN}, im2col_offsets{dims-2}
2383- // cache_hint, cache_hint_flag}
2384- // NumOperands = {Chain, IID} + {Actual intrinsic args}
2385- // = {2} + {3 + dims + im2col_offsets}
2386- size_t NumOps = N->getNumOperands ();
// In im2col mode the dimension count is looked up from the intrinsic ID
// (operand 1). In tile mode there are no offsets, so it is the operand count
// minus the 5 fixed operands (Chain, IID, src, cache_hint, cache_hint_flag).
2387- size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic (N->getConstantOperandVal (1 ))
2388- : (NumOps - 5 );
2389- // Offsets is always 'NumDims - 2' and only for im2col mode
2390- size_t NumOffsets = IsIm2Col ? (NumDims - 2 ) : 0 ;
// The cache-hint flag is the last operand; the hint is used only when it is 1.
2391- bool IsCacheHint = N->getConstantOperandVal (NumOps - 1 ) == 1 ;
// src + dims + offsets, plus cache_hint when the flag is set.
2392- size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1 );
2393-
2394- SDLoc DL (N);
// Take NumArgs operands starting at index 2 (past Chain and IID), then append
// the chain as the final operand.
2395- SmallVector<SDValue, 12 > Ops (N->ops ().slice (2 , NumArgs));
2396- Ops.push_back (N->getOperand (0 )); // Chain operand
2397-
2398- unsigned Opcode =
2399- GetCpAsyncBulkTensorPrefetchOpcode (NumDims, IsCacheHint, IsIm2Col);
2400- ReplaceNode (N, CurDAG->getMachineNode (Opcode, DL, N->getVTList (), Ops));
2401- }
2402-
24032314void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon (SDNode *N,
24042315 unsigned RedOp,
24052316 bool IsIm2Col) {
@@ -2419,8 +2330,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
24192330
24202331 bool IsShared32 =
24212332 CurDAG->getDataLayout ().getPointerSizeInBits (ADDRESS_SPACE_SHARED) == 32 ;
2422- unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode (
2423- NumDims, IsShared32, IsCacheHint, IsIm2Col, /* IsReduce= */ true );
2333+ unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode (
2334+ NumDims, IsShared32, IsCacheHint, IsIm2Col);
24242335 ReplaceNode (N, CurDAG->getMachineNode (Opcode, DL, N->getVTList (), Ops));
24252336}
24262337
@@ -2540,18 +2451,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
25402451 switch (IID) {
25412452 default :
25422453 return false ;
2543- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
2544- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
2545- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
2546- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
2547- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
2548- SelectCpAsyncBulkTensorS2GCommon (N);
2549- return true ;
2550- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
2551- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
2552- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
2553- SelectCpAsyncBulkTensorS2GCommon (N, /* IsIm2Col=*/ true );
2554- return true ;
25552454 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
25562455 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
25572456 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
@@ -2564,18 +2463,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
25642463 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
25652464 SelectCpAsyncBulkTensorG2SCommon (N, /* IsIm2Col=*/ true );
25662465 return true ;
2567- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
2568- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
2569- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
2570- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
2571- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
2572- SelectCpAsyncBulkTensorPrefetchCommon (N);
2573- return true ;
2574- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
2575- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
2576- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
2577- SelectCpAsyncBulkTensorPrefetchCommon (N, /* IsIm2Col=*/ true );
2578- return true ;
25792466 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
25802467 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
25812468 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
0 commit comments