Skip to content
This repository was archived by the owner on Jan 26, 2024. It is now read-only.

Commit 4ab5dbd

Browse files
Fix a crash when printf is used in a kernel launched on a stream with a custom CU mask
SWDEV-249719 - Root cause: queues with a custom CU mask are not inserted into queuePool_ (i.e., the pool of reusable HSA queues) of the ROC device class, which causes a crash when creating hostcall buffers for printf. Change-Id: Ieee7005d9a5a30b3113394ce23ee65927126d0d6
1 parent 334aa87 commit 4ab5dbd

File tree

3 files changed

+27
-12
lines changed

3 files changed

+27
-12
lines changed

device/rocm/rocdevice.cpp

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ Device::Device(hsa_agent_t bkendDevice)
143143
, hsa_exclusive_gpu_access_(false)
144144
, queuePool_(QueuePriority::Total)
145145
, coopHostcallBuffer_(nullptr)
146+
, queueWithCUMaskPool_(QueuePriority::Total)
146147
, numOfVgpus_(0) {
147148
group_segment_.handle = 0;
148149
system_segment_.handle = 0;
@@ -2300,7 +2301,13 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
23002301
hsa_queue_destroy(queue);
23012302
return nullptr;
23022303
}
2303-
// Skip queue recycling for queues with custom CU mask
2304+
// add queues with custom CU mask into their special pool to keep track
2305+
// of mapping of these queues to their associated queueInfo (i.e., hostcall buffers)
2306+
auto result = queueWithCUMaskPool_[qIndex].emplace(std::make_pair(queue, QueueInfo()));
2307+
assert(result.second && "QueueInfo already exists");
2308+
auto &qInfo = result.first->second;
2309+
qInfo.refCount = 1;
2310+
23042311
return queue;
23052312
}
23062313
if (coop_queue) {
@@ -2315,8 +2322,8 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
23152322
return queue;
23162323
}
23172324

2318-
void Device::releaseQueue(hsa_queue_t* queue) {
2319-
for (auto& it : queuePool_) {
2325+
void Device::releaseQueue(hsa_queue_t* queue, const std::vector<uint32_t>& cuMask) {
2326+
for (auto& it : cuMask.size() == 0 ? queuePool_ : queueWithCUMaskPool_) {
23202327
auto qIter = it.find(queue);
23212328
if (qIter != it.end()) {
23222329
auto &qInfo = qIter->second;
@@ -2345,18 +2352,22 @@ void Device::releaseQueue(hsa_queue_t* queue) {
23452352
hsa_queue_destroy(queue);
23462353
}
23472354

2348-
void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue, bool coop_queue) {
2355+
void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue, bool coop_queue,
2356+
const std::vector<uint32_t>& cuMask) {
23492357
decltype(queuePool_)::value_type::iterator qIter;
23502358

23512359
if (!coop_queue) {
2352-
for (auto &it : queuePool_) {
2360+
for (auto &it : cuMask.size() == 0 ? queuePool_ : queueWithCUMaskPool_) {
23532361
qIter = it.find(queue);
23542362
if (qIter != it.end()) {
23552363
break;
23562364
}
23572365
}
2358-
2359-
assert(qIter != queuePool_[QueuePriority::High].end());
2366+
if (cuMask.size() == 0) {
2367+
assert(qIter != queuePool_[QueuePriority::High].end());
2368+
} else {
2369+
assert(qIter != queueWithCUMaskPool_[QueuePriority::High].end());
2370+
}
23602371

23612372
if (qIter->second.hostcallBuffer_) {
23622373
return qIter->second.hostcallBuffer_;

device/rocm/rocdevice.hpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -459,11 +459,12 @@ class Device : public NullDevice {
459459
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
460460

461461
//! Release HSA queue
462-
void releaseQueue(hsa_queue_t*);
462+
void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {});
463463

464464
//! For the given HSA queue, return an existing hostcall buffer or create a
465465
//! new one. queuePool_ keeps a mapping from HSA queue to hostcall buffer.
466-
void* getOrCreateHostcallBuffer(hsa_queue_t* queue, bool coop_queue = false);
466+
void* getOrCreateHostcallBuffer(hsa_queue_t* queue, bool coop_queue = false,
467+
const std::vector<uint32_t>& cuMask = {});
467468

468469
//! Return multi GPU grid launch sync buffer
469470
address MGSync() const { return mg_sync_; }
@@ -524,7 +525,7 @@ class Device : public NullDevice {
524525
void* hostcallBuffer_;
525526
};
526527

527-
//!< a vector for keeping Pool of HSA queues with low, normal and high priorities for recycling
528+
//! a vector for keeping Pool of HSA queues with low, normal and high priorities for recycling
528529
std::vector<std::map<hsa_queue_t*, QueueInfo>> queuePool_;
529530

530531
//! returns a hsa queue from queuePool with least refCount and updates the refCount as well
@@ -535,6 +536,9 @@ class Device : public NullDevice {
535536
virtual bool findLinkInfo(const hsa_amd_memory_pool_t& pool,
536537
std::vector<LinkAttrType>* link_attr);
537538

539+
//! Pool of HSA queues with custom CU masks
540+
std::vector<std::map<hsa_queue_t*, QueueInfo>> queueWithCUMaskPool_;
541+
538542
public:
539543
std::atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
540544

device/rocm/rocvirtual.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -761,7 +761,7 @@ VirtualGPU::~VirtualGPU() {
761761
}
762762

763763
if (gpu_queue_) {
764-
roc_device_.releaseQueue(gpu_queue_);
764+
roc_device_.releaseQueue(gpu_queue_, cuMask_);
765765
}
766766
}
767767

@@ -2238,7 +2238,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
22382238
}
22392239
case amd::KernelParameterDescriptor::HiddenHostcallBuffer: {
22402240
if (amd::IS_HIP) {
2241-
auto buffer = roc_device_.getOrCreateHostcallBuffer(gpu_queue_, coopGroups);
2241+
auto buffer = roc_device_.getOrCreateHostcallBuffer(gpu_queue_, coopGroups, cuMask_);
22422242
if (!buffer) {
22432243
ClPrint(amd::LOG_ERROR, amd::LOG_KERN,
22442244
"Kernel expects a hostcall buffer, but none found");

0 commit comments

Comments
 (0)