@@ -836,21 +836,23 @@ struct AMDGPUKernelTy : public GenericKernelTy {
836836 }
837837
838838 // / Launch the AMDGPU kernel function.
839- Error launchImpl (GenericDeviceTy &GenericDevice, uint32_t NumThreads,
840- uint64_t NumBlocks, KernelArgsTy &KernelArgs,
839+ Error launchImpl (GenericDeviceTy &GenericDevice, uint32_t NumThreads[ 3 ] ,
840+ uint32_t NumBlocks[ 3 ] , KernelArgsTy &KernelArgs,
841841 KernelLaunchParamsTy LaunchParams,
842842 AsyncInfoWrapperTy &AsyncInfoWrapper) const override ;
843843
844844 // / Print more elaborate kernel launch info for AMDGPU
845845 Error printLaunchInfoDetails (GenericDeviceTy &GenericDevice,
846- KernelArgsTy &KernelArgs, uint32_t NumThreads,
847- uint64_t NumBlocks, int64_t MultiDeviceLB,
846+ KernelArgsTy &KernelArgs, uint32_t NumThreads[ 3 ] ,
847+ uint32_t NumBlocks[ 3 ] , int64_t MultiDeviceLB,
848848 int64_t MultiDeviceUB) const override ;
849849 // / Print the "old" AMD KernelTrace single-line format
850850 void printAMDOneLineKernelTrace (GenericDeviceTy &GenericDevice,
851- KernelArgsTy &KernelArgs, uint32_t NumThreads,
852- uint64_t NumBlocks, int64_t MultiDeviceLB,
851+ KernelArgsTy &KernelArgs,
852+ uint32_t NumThreads[3 ], uint32_t NumBlocks[3 ],
853+ int64_t MultiDeviceLB,
853854 int64_t MultiDeviceUB) const ;
855+
854856 // / Get group and private segment kernel size.
855857 uint32_t getGroupSize () const { return GroupSize; }
856858 uint32_t getPrivateSize () const { return PrivateSize; }
@@ -976,7 +978,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
976978 // / user-defined threads and block clauses.
977979 uint32_t getNumThreads (GenericDeviceTy &GenericDevice,
978980 uint32_t ThreadLimitClause[3 ]) const override {
979- assert (ThreadLimitClause[1 ] == 0 && ThreadLimitClause[2 ] == 0 &&
981+ assert (ThreadLimitClause[1 ] == 1 && ThreadLimitClause[2 ] == 1 &&
980982 " Multi dimensional launch not supported yet." );
981983
982984 // Honor OMP_TEAMS_THREAD_LIMIT environment variable and
@@ -997,7 +999,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
997999 TeamsThreadLimitEnvVar <= static_cast <int32_t >(ConstWGSize))
9981000 return llvm::omp::getBlockSizeAsPowerOfTwo (TeamsThreadLimitEnvVar);
9991001 if (ThreadLimitClause[0 ] > 0 && ThreadLimitClause[0 ] != (uint32_t )-1 &&
1000- ThreadLimitClause[0 ] <= static_cast <int32_t >(ConstWGSize))
1002+ ThreadLimitClause[0 ] <= static_cast <uint32_t >(ConstWGSize))
10011003 return llvm::omp::getBlockSizeAsPowerOfTwo (ThreadLimitClause[0 ]);
10021004 assert (((ConstWGSize & (ConstWGSize - 1 )) == 0 ) &&
10031005 " XTeam Reduction blocksize must be a power of two" );
@@ -1022,11 +1024,11 @@ struct AMDGPUKernelTy : public GenericKernelTy {
10221024 ? ThreadLimitClause[0 ]
10231025 : PreferredNumThreads);
10241026 }
1025- uint64_t getNumBlocks (GenericDeviceTy &GenericDevice,
1027+ uint32_t getNumBlocks (GenericDeviceTy &GenericDevice,
10261028 uint32_t NumTeamsClause[3 ], uint64_t LoopTripCount,
10271029 uint32_t &NumThreads,
10281030 bool IsNumThreadsFromUser) const override {
1029- assert (NumTeamsClause[1 ] == 0 && NumTeamsClause[2 ] == 0 &&
1031+ assert (NumTeamsClause[1 ] == 1 && NumTeamsClause[2 ] == 1 &&
10301032 " Multi dimensional launch not supported yet." );
10311033
10321034 const auto getNumGroupsFromThreadsAndTripCount =
@@ -1062,7 +1064,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
10621064 getNumGroupsFromThreadsAndTripCount (LoopTripCount, NumThreads);
10631065
10641066 // Honor OMP_NUM_TEAMS environment variable for BigJumpLoop kernel type.
1065- if (NumTeamsEnvVar > 0 && NumTeamsEnvVar <= GenericDevice.getBlockLimit ())
1067+ if (NumTeamsEnvVar > 0 && static_cast <uint32_t >(NumTeamsEnvVar) <=
1068+ GenericDevice.getBlockLimit ())
10661069 NumGroups = std::min (static_cast <uint64_t >(NumTeamsEnvVar), NumGroups);
10671070 // Honor num_teams clause but lower it if tripcount dictates.
10681071 else if (NumTeamsClause[0 ] > 0 &&
@@ -1145,8 +1148,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11451148 NumTeamsClause[0 ] <= GenericDevice.getBlockLimit ()) {
11461149 NumGroups =
11471150 std::min (static_cast <uint64_t >(NumTeamsClause[0 ]), MaxNumGroups);
1148- } else if (NumTeamsEnvVar > 0 &&
1149- NumTeamsEnvVar <= GenericDevice.getBlockLimit ()) {
1151+ } else if (NumTeamsEnvVar > 0 && static_cast < uint32_t >(NumTeamsEnvVar) <=
1152+ GenericDevice.getBlockLimit ()) {
11501153 NumGroups =
11511154 std::min (static_cast <uint64_t >(NumTeamsEnvVar), MaxNumGroups);
11521155 } else {
@@ -1462,8 +1465,8 @@ struct AMDGPUQueueTy {
14621465 // / Push a kernel launch to the queue. The kernel launch requires an output
14631466 // / signal and can define an optional input signal (nullptr if none).
14641467 Error pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
1465- uint32_t NumThreads, uint64_t NumBlocks,
1466- uint32_t GroupSize, uint32_t StackSize,
1468+ uint32_t NumThreads[ 3 ], uint32_t NumBlocks[ 3 ] ,
1469+ uint32_t GroupSize, uint64_t StackSize,
14671470 AMDGPUSignalTy *OutputSignal,
14681471 AMDGPUSignalTy *InputSignal) {
14691472 assert (OutputSignal && " Invalid kernel output signal" );
@@ -1489,17 +1492,23 @@ struct AMDGPUQueueTy {
14891492 assert (Packet && " Invalid packet" );
14901493
14911494 // The first 32 bits of the packet are written after the other fields
1492- uint16_t Setup = UINT16_C (1 ) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1493- Packet->workgroup_size_x = NumThreads;
1494- Packet->workgroup_size_y = 1 ;
1495- Packet->workgroup_size_z = 1 ;
1495+ uint16_t Dims = NumBlocks[2 ] * NumThreads[2 ] > 1
1496+ ? 3
1497+ : 1 + (NumBlocks[1 ] * NumThreads[1 ] != 1 );
1498+ uint16_t Setup = UINT16_C (Dims)
1499+ << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1500+ Packet->workgroup_size_x = NumThreads[0 ];
1501+ Packet->workgroup_size_y = NumThreads[1 ];
1502+ Packet->workgroup_size_z = NumThreads[2 ];
14961503 Packet->reserved0 = 0 ;
1497- Packet->grid_size_x = NumBlocks * NumThreads;
1498- Packet->grid_size_y = 1 ;
1499- Packet->grid_size_z = 1 ;
1504+ Packet->grid_size_x = NumBlocks[ 0 ] * NumThreads[ 0 ] ;
1505+ Packet->grid_size_y = NumBlocks[ 1 ] * NumThreads[ 1 ] ;
1506+ Packet->grid_size_z = NumBlocks[ 2 ] * NumThreads[ 2 ] ;
15001507 Packet->private_segment_size =
1501- Kernel.usesDynamicStack () ? std::max (Kernel.getPrivateSize (), StackSize)
1502- : Kernel.getPrivateSize ();
1508+ Kernel.usesDynamicStack ()
1509+ ? std::max (static_cast <uint64_t >(Kernel.getPrivateSize ()),
1510+ StackSize)
1511+ : Kernel.getPrivateSize ();
15031512 Packet->group_segment_size = GroupSize;
15041513 Packet->kernel_object = Kernel.getKernelObject ();
15051514 Packet->kernarg_address = KernelArgs;
@@ -2117,8 +2126,9 @@ struct AMDGPUStreamTy {
21172126 // / the kernel args buffer to the specified memory manager.
21182127 Error
21192128 pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
2120- uint32_t NumThreads, uint64_t NumBlocks, uint32_t GroupSize,
2121- uint32_t StackSize, AMDGPUMemoryManagerTy &MemoryManager,
2129+ uint32_t NumThreads[3 ], uint32_t NumBlocks[3 ],
2130+ uint32_t GroupSize, uint32_t StackSize,
2131+ AMDGPUMemoryManagerTy &MemoryManager,
21222132 std::unique_ptr<ompt::OmptEventInfoTy> OmptInfo = nullptr ) {
21232133 if (Queue == nullptr )
21242134 return Plugin::error (" Target queue was nullptr" );
@@ -4222,10 +4232,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
42224232 AsyncInfoWrapperTy AsyncInfoWrapper (*this , nullptr );
42234233
42244234 KernelArgsTy KernelArgs = {};
4225- if ( auto Err =
4226- AMDGPUKernel.launchImpl (* this , /* NumThread= */ 1u ,
4227- /* NumBlocks= */ 1ul , KernelArgs,
4228- KernelLaunchParamsTy{}, AsyncInfoWrapper))
4235+ uint32_t NumBlocksAndThreads[ 3 ] = { 1u , 1u , 1u };
4236+ if ( auto Err = AMDGPUKernel.launchImpl (
4237+ * this , NumBlocksAndThreads, NumBlocksAndThreads , KernelArgs,
4238+ KernelLaunchParamsTy{}, AsyncInfoWrapper))
42294239 return Err;
42304240
42314241 Error Err = Plugin::success ();
@@ -4960,7 +4970,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
49604970};
49614971
49624972Error AMDGPUKernelTy::launchImpl (GenericDeviceTy &GenericDevice,
4963- uint32_t NumThreads, uint64_t NumBlocks,
4973+ uint32_t NumThreads[ 3 ], uint32_t NumBlocks[ 3 ] ,
49644974 KernelArgsTy &KernelArgs,
49654975 KernelLaunchParamsTy LaunchParams,
49664976 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -5041,13 +5051,15 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
50415051 if (ImplArgs &&
50425052 getImplicitArgsSize () == sizeof (hsa_utils::AMDGPUImplicitArgsTy)) {
50435053 DP (" Setting fields of ImplicitArgs for COV5\n " );
5044- ImplArgs->BlockCountX = NumBlocks;
5045- ImplArgs->BlockCountY = 1 ;
5046- ImplArgs->BlockCountZ = 1 ;
5047- ImplArgs->GroupSizeX = NumThreads;
5048- ImplArgs->GroupSizeY = 1 ;
5049- ImplArgs->GroupSizeZ = 1 ;
5050- ImplArgs->GridDims = 1 ;
5054+ ImplArgs->BlockCountX = NumBlocks[0 ];
5055+ ImplArgs->BlockCountY = NumBlocks[1 ];
5056+ ImplArgs->BlockCountZ = NumBlocks[2 ];
5057+ ImplArgs->GroupSizeX = NumThreads[0 ];
5058+ ImplArgs->GroupSizeY = NumThreads[1 ];
5059+ ImplArgs->GroupSizeZ = NumThreads[2 ];
5060+ ImplArgs->GridDims = NumBlocks[2 ] * NumThreads[2 ] > 1
5061+ ? 3
5062+ : 1 + (NumBlocks[1 ] * NumThreads[1 ] != 1 );
50515063 ImplArgs->HeapV1Ptr =
50525064 (uint64_t )AMDGPUDevice.getPreAllocatedDeviceMemoryPool ();
50535065 ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem ;
@@ -5065,8 +5077,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
50655077
50665078void AMDGPUKernelTy::printAMDOneLineKernelTrace (GenericDeviceTy &GenericDevice,
50675079 KernelArgsTy &KernelArgs,
5068- uint32_t NumThreads,
5069- uint64_t NumBlocks,
5080+ uint32_t NumThreads[ 3 ] ,
5081+ uint32_t NumBlocks[ 3 ] ,
50705082 int64_t MultiDeviceLB,
50715083 int64_t MultiDeviceUB) const {
50725084 auto GroupSegmentSize = (*KernelInfo).GroupSegmentList ;
@@ -5084,17 +5096,17 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
50845096 " md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
50855097 " %d%% n:%s\n " ,
50865098 GenericDevice.getDeviceId (), getExecutionModeFlags (), ConstWGSize,
5087- KernelArgs.NumArgs , NumBlocks, NumThreads, 0 , 0 , GroupSegmentSize ,
5088- SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount ,
5089- KernelArgs.Tripcount , NeedsHostServices, isMultiDeviceKernel () ,
5090- MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy ,
5091- getName ());
5099+ KernelArgs.NumArgs , NumBlocks[ 0 ] , NumThreads[ 0 ] , 0 , 0 ,
5100+ GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount,
5101+ VGPRSpillCount, KernelArgs.Tripcount , NeedsHostServices,
5102+ isMultiDeviceKernel (), MultiDeviceLB, MultiDeviceUB, MaxOccupancy,
5103+ AchievedOccupancy, getName ());
50925104}
50935105
50945106Error AMDGPUKernelTy::printLaunchInfoDetails (GenericDeviceTy &GenericDevice,
50955107 KernelArgsTy &KernelArgs,
5096- uint32_t NumThreads,
5097- uint64_t NumBlocks,
5108+ uint32_t NumThreads[ 3 ] ,
5109+ uint32_t NumBlocks[ 3 ] ,
50985110 int64_t MultiDeviceLB,
50995111 int64_t MultiDeviceUB) const {
51005112 // When LIBOMPTARGET_KERNEL_TRACE is set, print the single-line kernel trace
@@ -5140,12 +5152,13 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
51405152 // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel
51415153 // Tripcount: loop tripcount for the kernel
51425154 INFO (OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId (),
5143- " #Args: %d Teams x Thrds: %4lux %4u (MaxFlatWorkGroupSize: %u) LDS "
5155+ " #Args: %d Teams x Thrds: %4ux %4u (MaxFlatWorkGroupSize: %u) LDS "
51445156 " Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
51455157 " %lu\n " ,
5146- ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize,
5147- GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
5148- LoopTripCount);
5158+ ArgNum, NumGroups[0 ] * NumGroups[1 ] * NumGroups[2 ],
5159+ ThreadsPerGroup[0 ] * ThreadsPerGroup[1 ] * ThreadsPerGroup[2 ],
5160+ MaxFlatWorkgroupSize, GroupSegmentSize, SGPRCount, VGPRCount,
5161+ SGPRSpillCount, VGPRSpillCount, LoopTripCount);
51495162
51505163 return Plugin::success ();
51515164}
0 commit comments