@@ -711,7 +711,6 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
711711private:
712712 /// The executable loaded on the agent.
713713 hsa_executable_t Executable;
714- hsa_code_object_t CodeObject;
715714#if SANITIZER_AMDGPU
716715 hsa_code_object_reader_t CodeObjectReader;
717716#endif
@@ -1715,8 +1714,8 @@ struct AMDGPUStreamTy {
17151714
17161715 /// Create an empty slot.
17171716 StreamSlotTy ()
1718- : Signal(nullptr ), Callbacks({}), ActionArgs({} ),
1719- OmptActionFunction ( nullptr ) {}
1717+ : Signal(nullptr ), Callbacks({}), OmptActionFunction( nullptr ),
1718+ ActionArgs ({} ) {}
17201719
17211720 /// Schedule a host memory copy action on the slot.
17221721 Error schedHostMemoryCopy (void *Dst, const void *Src, size_t Size) {
@@ -1864,6 +1863,10 @@ struct AMDGPUStreamTy {
18641863 /// Use synchronous copy back.
18651864 bool UseSyncCopyBack;
18661865
1866+ /// When copying data from one host buffer to another, only do it
1867+ /// asynchronously if `OMPX_MinHostToHostAsyncCopySize <= size` (in bytes).
1868+ UInt32Envar OMPX_MinHostToHostAsyncCopySize;
1869+
18671870 /// Arguments for the callback function.
18681871 PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
18691872
@@ -2031,8 +2034,8 @@ struct AMDGPUStreamTy {
20312034 assert (Args->Signal &&
20322035 " Invalid AMDGPUSignal Pointer in post kernel run processing" );
20332036 hsa_amd_profiling_dispatch_time_t TimeRec;
2034- hsa_status_t Status = hsa_amd_profiling_get_dispatch_time (
2035- Args-> Agent , Args-> Signal -> get (), &TimeRec);
2037+ hsa_amd_profiling_get_dispatch_time (Args-> Agent , Args-> Signal -> get (),
2038+ &TimeRec);
20362039
20372040 uint64_t StartTime = TimeRec.start * Args->TicksToTime ;
20382041 uint64_t EndTime = TimeRec.end * Args->TicksToTime ;
@@ -2288,6 +2291,14 @@ struct AMDGPUStreamTy {
22882291 return Err;
22892292 }
22902293
2294+ if (CopySize < OMPX_MinHostToHostAsyncCopySize) {
2295+ if (auto Err =
2296+ OutputSignals[0 ]->wait (StreamBusyWaitMicroseconds, &Device))
2297+ return Err;
2298+ std::memcpy (Dst, Inter, CopySize);
2299+ return Error::success ();
2300+ }
2301+
22912302 // Consume another stream slot and compute dependencies.
22922303 std::tie (Curr, InputSignal) = consume (OutputSignals[1 ]);
22932304 assert (InputSignal && " Invalid input signal" );
@@ -2919,7 +2930,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
29192930 " OMPX_ENABLE_GFX90A_COARSE_GRAIN_SHARED_ALLOC" , false ),
29202931 OMPX_StrictSanityChecks (" OMPX_STRICT_SANITY_CHECKS" , false ),
29212932 OMPX_SyncCopyBack (" LIBOMPTARGET_SYNC_COPY_BACK" , true ),
2922- OMPX_APUPrefaultMemcopy (" LIBOMPTARGET_APU_PREFAULT_MEMCOPY" , " true" ),
2933+ OMPX_APUPrefaultMemcopy (" LIBOMPTARGET_APU_PREFAULT_MEMCOPY" , true ),
29232934 OMPX_APUPrefaultMemcopySize (" LIBOMPTARGET_APU_PREFAULT_MEMCOPY_SIZE" ,
29242935 1 * 1024 * 1024 ), // 1MB
29252936 OMPX_DGPUMaps (" OMPX_DGPU_MAPS" , false ),
@@ -3892,6 +3903,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
38923903 case HSA_DEVICE_TYPE_DSP:
38933904 TmpCharPtr = " DSP" ;
38943905 break ;
3906+ case HSA_DEVICE_TYPE_AIE:
3907+ TmpCharPtr = " AIE" ;
3908+ break ;
38953909 }
38963910 Info.add (" Device Type" , TmpCharPtr);
38973911 }
@@ -4683,7 +4697,9 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
46834697 Slots(32 ), NextSlot(0 ), SyncCycle(0 ),
46844698 StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
46854699 UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()),
4686- UseSyncCopyBack(Device.syncCopyBack()) {}
4700+ UseSyncCopyBack(Device.syncCopyBack()),
4701+ OMPX_MinHostToHostAsyncCopySize(
4702+ " LIBOMPTARGET_AMDGPU_MIN_HOST_TO_HOST_ASYNC_COPY_SIZE" , 2048 ) {}
46874703
46884704/// Class implementing the AMDGPU-specific functionalities of the global
46894705/// handler.
@@ -5066,7 +5082,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
50665082 if (LaunchParams.Size )
50675083 std::memcpy (AllArgs, LaunchParams.Data , LaunchParams.Size );
50685084
5069- uint64_t Buffer = 0 ;
50705085 AMDGPUDeviceTy &AMDGPUDevice = static_cast <AMDGPUDeviceTy &>(GenericDevice);
50715086 AMDGPUStreamTy *Stream = nullptr ;
50725087 if (auto Err = AMDGPUDevice.getStream (AsyncInfoWrapper, Stream))
@@ -5117,7 +5132,7 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
51175132 // This line should print exactly as the one in the old plugin.
51185133 fprintf (
51195134 stderr,
5120- " DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4luX %4d) "
5135+ " DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX %4d) "
51215136 " reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u agpr_count:%u "
51225137 " sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
51235138 " md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
@@ -5310,8 +5325,8 @@ static std::pair<uint64_t, uint64_t>
53105325getKernelStartAndEndTime (const OmptKernelTimingArgsAsyncTy *Args) {
53115326 assert (Args->Signal && " Invalid AMDGPUSignal Pointer in OMPT profiling" );
53125327 hsa_amd_profiling_dispatch_time_t TimeRec;
5313- hsa_status_t Status = hsa_amd_profiling_get_dispatch_time (
5314- Args-> Agent , Args-> Signal -> get (), &TimeRec);
5328+ hsa_amd_profiling_get_dispatch_time (Args-> Agent , Args-> Signal -> get (),
5329+ &TimeRec);
53155330
53165331 uint64_t StartTime = TimeRec.start * Args->TicksToTime ;
53175332 uint64_t EndTime = TimeRec.end * Args->TicksToTime ;
@@ -5323,8 +5338,7 @@ static std::pair<uint64_t, uint64_t>
53235338getCopyStartAndEndTime (const OmptKernelTimingArgsAsyncTy *Args) {
53245339 assert (Args->Signal && " Invalid AMDGPUSignal Pointer in OMPT profiling" );
53255340 hsa_amd_profiling_async_copy_time_t TimeRec;
5326- hsa_status_t Status =
5327- hsa_amd_profiling_get_async_copy_time (Args->Signal ->get (), &TimeRec);
5341+ hsa_amd_profiling_get_async_copy_time (Args->Signal ->get (), &TimeRec);
53285342 uint64_t StartTime = TimeRec.start * Args->TicksToTime ;
53295343 uint64_t EndTime = TimeRec.end * Args->TicksToTime ;
53305344
0 commit comments