Skip to content

Commit 09f3a2a

Browse files
Freehill, Chris and cfreeamd
authored
rocr: Ensure AqlQueue can exit on memory error (#378)
A hang would occur after a memory error because the AqlQueue destructor would wait for a signal that never arrived. This change allows the destructor to break out of the wait loop. Co-authored-by: Chris Freehill <cfreehil@amd.com>
1 parent 737ba1d commit 09f3a2a

File tree

1 file changed

+7
-7
lines changed

1 file changed

+7
-7
lines changed

runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1291,12 +1291,15 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) {
12911291

12921292
AqlQueue* queue = (AqlQueue*)arg;
12931293
hsa_status_t errorCode = HSA_STATUS_ERROR;
1294-
1295-
if (queue->exceptionState == ERROR_HANDLER_TERMINATE) {
1294+
auto exceptionHandlerDone = [&]() {
12961295
Signal* signal = queue->exception_signal_;
12971296
queue->exceptionState = ERROR_HANDLER_DONE;
12981297
signal->StoreRelease(0);
12991298
return false;
1299+
};
1300+
1301+
if (queue->exceptionState == ERROR_HANDLER_TERMINATE) {
1302+
return exceptionHandlerDone();
13001303
}
13011304

13021305
for (auto& error : QueueErrors) {
@@ -1313,7 +1316,7 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) {
13131316
// handler.
13141317
if (errorCode == static_cast<hsa_status_t>(HSA_STATUS_ERROR_MEMORY_FAULT)) {
13151318
debug_print("Queue error - HSA_STATUS_ERROR_MEMORY_FAULT\n");
1316-
return false;
1319+
return exceptionHandlerDone();
13171320
}
13181321

13191322
// Fallback if KFD does not support GPU core dump. In this case, the core dump is
@@ -1335,10 +1338,7 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) {
13351338
if (queue->errors_callback_ != nullptr) {
13361339
queue->errors_callback_(errorCode, queue->public_handle(), queue->errors_data_);
13371340
}
1338-
Signal* signal = queue->exception_signal_;
1339-
queue->exceptionState = ERROR_HANDLER_DONE;
1340-
signal->StoreRelease(0);
1341-
return false;
1341+
return exceptionHandlerDone();
13421342
}
13431343

13441344
hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t* cu_mask) {

0 commit comments

Comments
 (0)