@@ -61,16 +61,28 @@ min_command(uint slot_num, __global AmdAqlWrap* wraps)
     return minCommand;
 }
 
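+// write_index doubles as a capability flag: it is left at the
+// kInvalidWriteIndex sentinel (presumably by the host-side setup) when
+// PCIe 3 atomics are usable, so the device-side workaround can be skipped.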
+static inline bool
+check_pcie_support(__global SchedulerParam* param) {
+#define kInvalidWriteIndex (ulong)(-1)
+    return param->write_index == kInvalidWriteIndex;
+}
+
 static inline void
 EnqueueDispatch(__global hsa_kernel_dispatch_packet_t* aqlPkt, __global SchedulerParam* param)
 {
     __global hsa_queue_t* child_queue = param->child_queue;
 
-
-    // ulong index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed);
-    // The original code seen above relies on PCIe 3 atomics, which might not be supported on some systems, so use a device side global
-    // for workaround.
-    ulong index = atomic_fetch_add_explicit((__global atomic_ulong*)&param->write_index, (ulong)1, memory_order_relaxed, memory_scope_device);
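+    // __ockl_hsa_queue_add_write_index() needs PCIe 3 atomics; fall back
+    // to a device-side global write index when they are unavailable.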
+    ulong index;
+    if (check_pcie_support(param)) {
+        index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed);
+    } else {
+        index = atomic_fetch_add_explicit((__global atomic_ulong*)&param->write_index, (ulong)1, memory_order_relaxed, memory_scope_device);
+    }
 
     const ulong queueMask = child_queue->size - 1;
     __global hsa_kernel_dispatch_packet_t* dispatch_packet = &(((__global hsa_kernel_dispatch_packet_t*)(child_queue->base_address))[index & queueMask]);
@@ -82,17 +94,22 @@ EnqueueScheduler(__global SchedulerParam* param)
 {
     __global hsa_queue_t* child_queue = param->child_queue;
 
-    // ulong index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed);
-    // The original code seen above relies on PCIe 3 atomics, which might not be supported on some systems, so use a device side global
-    // for workaround.
-    ulong index = atomic_fetch_add_explicit((__global atomic_ulong*)&param->write_index, (ulong)1, memory_order_relaxed, memory_scope_device);
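+    // Same PCIe 3 atomics check and fallback as in EnqueueDispatch above.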
+    ulong index;
+    if (check_pcie_support(param)) {
+        index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed);
+    } else {
+        index = atomic_fetch_add_explicit((__global atomic_ulong*)&param->write_index, (ulong)1, memory_order_relaxed, memory_scope_device);
+    }
 
     const ulong queueMask = child_queue->size - 1;
     __global hsa_kernel_dispatch_packet_t* dispatch_packet = &(((__global hsa_kernel_dispatch_packet_t*)(child_queue->base_address))[index & queueMask]);
     *dispatch_packet = param->scheduler_aql;
 
-    // This is part of the PCIe 3 atomics workaround, to write the final write_index value back to the child_queue
-    __ockl_hsa_queue_store_write_index(child_queue, index + 1, __ockl_memory_order_relaxed);
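+    // Workaround path only: publish the final write_index back to the child queue.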
+    if (!check_pcie_support(param)) {
+        __ockl_hsa_queue_store_write_index(child_queue, index + 1, __ockl_memory_order_relaxed);
+    }
 
     __ockl_hsa_signal_store(child_queue->doorbell_signal, index, __ockl_memory_order_release);
 }
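
Note on the write_index sentinel: the device code assumes the host primes param->write_index before the scheduler kernel is enqueued. Below is a minimal host-side sketch of that contract; SchedulerParamHost, init_write_index, and the pcie3_atomics flag are hypothetical names for illustration, not part of this change.

#include <stdbool.h>
#include <stdint.h>

/* Sentinel mirrored from the device-side kInvalidWriteIndex, i.e. (ulong)(-1). */
#define kInvalidWriteIndex ((uint64_t)-1)

/* Hypothetical trimmed-down host view of SchedulerParam; only the
 * write_index field matters for this sketch. */
typedef struct {
    uint64_t write_index;
} SchedulerParamHost;

/* Prime write_index before launch: leave the sentinel in place when PCIe 3
 * atomics work (the device then uses __ockl_hsa_queue_add_write_index),
 * otherwise seed the device-side counter with the queue's current write
 * index so atomic_fetch_add_explicit can take over. */
static void init_write_index(SchedulerParamHost* param, bool pcie3_atomics,
                             uint64_t queue_write_index)
{
    param->write_index = pcie3_atomics ? kInvalidWriteIndex
                                       : queue_write_index;
}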