Skip to content

Commit 9e33f48

Browse files
authored
SWDEV-465041 - Enable queue write index programming (llvm#2404)
The workaround is active only if the system does not support PCIe atomics.
1 parent de56c73 commit 9e33f48

File tree

1 file changed

+21
-11
lines changed

1 file changed

+21
-11
lines changed

amd/device-libs/opencl/src/devenq/schedule_rocm.cl

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -61,16 +61,23 @@ min_command(uint slot_num, __global AmdAqlWrap* wraps)
6161
return minCommand;
6262
}
6363

64+
static inline bool  // Returns true when param->write_index equals the kInvalidWriteIndex sentinel, false otherwise.
65+
check_pcie_support(__global SchedulerParam* param) {  // Callers in this diff use __ockl_hsa_queue_add_write_index on true and an atomic add on param->write_index on false.
66+
#define kInvalidWriteIndex (ulong)(-1)  // All-ones ulong sentinel. NOTE(review): presumably written by the host when PCIe atomics are available - confirm. The macro stays defined after this function.
67+
return (param->write_index == kInvalidWriteIndex) ? true : false;  // Plain (non-atomic) read of write_index; the "? true : false" is redundant but kept as committed.
68+
}
69+
6470
static inline void
6571
EnqueueDispatch(__global hsa_kernel_dispatch_packet_t* aqlPkt, __global SchedulerParam* param)
6672
{
6773
__global hsa_queue_t* child_queue = param->child_queue;
6874

69-
70-
// ulong index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed);
71-
// The original code seen above relies on PCIe 3 atomics, which might not be supported on some systems, so use a device side global
72-
// for workaround.
73-
ulong index = atomic_fetch_add_explicit((__global atomic_ulong*)&param->write_index, (ulong)1, memory_order_relaxed, memory_scope_device);
75+
ulong index;
76+
if (check_pcie_support(param)) {
77+
index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed);
78+
} else {
79+
index = atomic_fetch_add_explicit((__global atomic_ulong*)&param->write_index, (ulong)1, memory_order_relaxed, memory_scope_device);
80+
}
7481

7582
const ulong queueMask = child_queue->size - 1;
7683
__global hsa_kernel_dispatch_packet_t* dispatch_packet = &(((__global hsa_kernel_dispatch_packet_t*)(child_queue->base_address))[index & queueMask]);
@@ -82,17 +89,20 @@ EnqueueScheduler(__global SchedulerParam* param)
8289
{
8390
__global hsa_queue_t* child_queue = param->child_queue;
8491

85-
// ulong index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed);
86-
// The original code seen above relies on PCIe 3 atomics, which might not be supported on some systems, so use a device side global
87-
// for workaround.
88-
ulong index = atomic_fetch_add_explicit((__global atomic_ulong*)&param->write_index, (ulong)1, memory_order_relaxed, memory_scope_device);
92+
ulong index;
93+
if (check_pcie_support(param)) {
94+
index = __ockl_hsa_queue_add_write_index(child_queue, 1, __ockl_memory_order_relaxed);
95+
} else {
96+
index = atomic_fetch_add_explicit((__global atomic_ulong*)&param->write_index, (ulong)1, memory_order_relaxed, memory_scope_device);
97+
}
8998

9099
const ulong queueMask = child_queue->size - 1;
91100
__global hsa_kernel_dispatch_packet_t* dispatch_packet = &(((__global hsa_kernel_dispatch_packet_t*)(child_queue->base_address))[index & queueMask]);
92101
*dispatch_packet = param->scheduler_aql;
93102

94-
// This is part of the PCIe 3 atomics workaround, to write the final write_index value back to the child_queue
95-
__ockl_hsa_queue_store_write_index(child_queue, index + 1, __ockl_memory_order_relaxed);
103+
if (!check_pcie_support(param)) {
104+
__ockl_hsa_queue_store_write_index(child_queue, index + 1, __ockl_memory_order_relaxed);
105+
}
96106

97107
__ockl_hsa_signal_store(child_queue->doorbell_signal, index, __ockl_memory_order_release);
98108
}

0 commit comments

Comments
 (0)