1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -75,6 +75,7 @@ llvm/include/llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h @intel/llvm-reviewers-
llvm/lib/SYCLLowerIR/SYCLCreateNVVMAnnotations.cpp @intel/llvm-reviewers-cuda
llvm/lib/Target/NVPTX @intel/llvm-reviewers-cuda
llvm/lib/Target/AMDGPU @intel/llvm-reviewers-cuda
unified-runtime/source/common/cuda-hip @intel/llvm-reviewers-cuda

# XPTI instrumentation utilities
xpti/ @intel/llvm-reviewers-runtime
2 changes: 1 addition & 1 deletion unified-runtime/source/adapters/cuda/command_buffer.cpp
@@ -1133,7 +1133,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp(
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
ur_stream_guard Guard;
CUstream CuStream = hQueue->getNextComputeStream(
numEventsInWaitList, phEventWaitList, Guard, &StreamToken);

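The only change in this file is the spelling of the guard type: `ur_stream_guard_` becomes `ur_stream_guard`. For readers unfamiliar with the call site, the guard is passed by reference so that, when the queue hands back a stream reused from an event in the wait list, it can also hand over a lock that must stay held while work is enqueued. Below is a minimal stand-alone sketch of that hand-off, assuming `ur_stream_guard` is an alias for `std::unique_lock<std::mutex>` (the old `ur_stream_guard_` appears to have been such an alias); every other name is invented for illustration.

```cpp
// Sketch only: assumes ur_stream_guard aliases std::unique_lock<std::mutex>.
// StreamPoolSketch and selectStream are invented stand-ins for the adapter's
// queue, not real unified-runtime API.
#include <mutex>
#include <utility>

using ur_stream_guard = std::unique_lock<std::mutex>;

struct StreamPoolSketch {
  std::mutex ComputeStreamSyncMutex;

  // If an existing stream can be reused, keep ComputeStreamSyncMutex locked
  // and transfer ownership of the lock to the caller through Guard;
  // otherwise leave Guard disengaged.
  int selectStream(bool CanReuse, ur_stream_guard &Guard) {
    if (CanReuse) {
      std::unique_lock<std::mutex> SyncGuard(ComputeStreamSyncMutex);
      Guard = ur_stream_guard{std::move(SyncGuard)};
      return 1; // stand-in for the reused CUstream
    }
    Guard = {}; // no lock needs to outlive this call
    return 0;   // stand-in for a freshly selected CUstream
  }
};

int main() {
  StreamPoolSketch Pool;
  ur_stream_guard Guard; // default-constructed: owns no lock yet
  int Stream = Pool.selectStream(/*CanReuse=*/true, Guard);
  // Work would be enqueued on Stream here; Guard releases the mutex when it
  // goes out of scope, mirroring the call sites in this diff.
  (void)Stream;
  return 0;
}
```

Only the alias name changes in this PR; the locking behaviour at the call sites is untouched.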
8 changes: 4 additions & 4 deletions unified-runtime/source/adapters/cuda/enqueue.cpp
@@ -315,7 +315,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
try {
ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
ur_stream_guard Guard;
CUstream CuStream = hQueue->getNextComputeStream(
numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
{
@@ -440,7 +440,7 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,

ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
ur_stream_guard Guard;
CUstream CuStream = hQueue->getNextComputeStream(
numEventsInWaitList, phEventWaitList, Guard, &StreamToken);

@@ -628,7 +628,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(

ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
ur_stream_guard Guard;
CUstream CuStream = hQueue->getNextComputeStream(
numEventsInWaitList, phEventWaitList, Guard, &StreamToken);

@@ -1517,7 +1517,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
try {
ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
ur_stream_guard Guard;
CUstream CuStream = hQueue->getNextComputeStream(
numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
8 changes: 4 additions & 4 deletions unified-runtime/source/adapters/cuda/event.cpp
@@ -87,17 +87,17 @@ bool ur_event_handle_t_::isCompleted() const noexcept try {

uint64_t ur_event_handle_t_::getQueuedTime() const {
assert(isStarted());
return Queue->get_device()->getElapsedTime(EvQueued);
return Queue->getDevice()->getElapsedTime(EvQueued);
}

uint64_t ur_event_handle_t_::getStartTime() const {
assert(isStarted());
return Queue->get_device()->getElapsedTime(EvStart);
return Queue->getDevice()->getElapsedTime(EvStart);
}

uint64_t ur_event_handle_t_::getEndTime() const {
assert(isStarted() && isRecorded());
return Queue->get_device()->getElapsedTime(EvEnd);
return Queue->getDevice()->getElapsedTime(EvEnd);
}

ur_result_t ur_event_handle_t_::record() {
@@ -111,7 +111,7 @@ ur_result_t ur_event_handle_t_::record() {
UR_ASSERT(Queue, UR_RESULT_ERROR_INVALID_QUEUE);

try {
EventID = Queue->getNextEventID();
EventID = Queue->getNextEventId();
if (EventID == 0) {
die("Unrecoverable program state reached in event identifier overflow");
}
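This file only switches to the camelCase accessors (`getDevice`, `getNextEventId`); the profiling logic itself is unchanged. For context on what `getElapsedTime` ultimately measures, here is a hedged sketch of the underlying CUDA driver primitive. The way the adapter wires `EvQueued`/`EvStart`/`EvEnd` to a per-device base event is inferred from the names above, not quoted from this PR, and the snippet uses only plain driver API calls.

```cpp
// Hedged sketch: obtains an elapsed time from two CUevents via
// cuEventElapsedTime. The adapter's getElapsedTime presumably does something
// similar against a base event; that mapping is an assumption. Error
// checking is omitted for brevity.
#include <cuda.h>
#include <cstdio>

int main() {
  cuInit(0);
  CUdevice Dev;
  CUcontext Ctx;
  cuDeviceGet(&Dev, 0);
  cuCtxCreate(&Ctx, 0, Dev);

  CUevent EvBase, EvEnd;
  cuEventCreate(&EvBase, CU_EVENT_DEFAULT);
  cuEventCreate(&EvEnd, CU_EVENT_DEFAULT);

  cuEventRecord(EvBase, 0); // reference timestamp on the default stream
  // ... the measured work would be enqueued here ...
  cuEventRecord(EvEnd, 0);
  cuEventSynchronize(EvEnd);

  float Ms = 0.0f;
  cuEventElapsedTime(&Ms, EvBase, EvEnd); // milliseconds between the events
  std::printf("elapsed: %.3f ms (~%.0f ns)\n", Ms, Ms * 1.0e6f);

  cuEventDestroy(EvBase);
  cuEventDestroy(EvEnd);
  cuCtxDestroy(Ctx);
  return 0;
}
```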
90 changes: 7 additions & 83 deletions unified-runtime/source/adapters/cuda/queue.cpp
@@ -32,93 +32,17 @@ void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded(
}
}

CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) {
if (getThreadLocalStream() != CUstream{0})
return getThreadLocalStream();
uint32_t StreamI;
uint32_t Token;
while (true) {
if (NumComputeStreams < ComputeStreams.size()) {
// the check above is for performance - so as not to lock mutex every time
std::lock_guard<std::mutex> guard(ComputeStreamMutex);
// The second check is done after mutex is locked so other threads can not
// change NumComputeStreams after that
if (NumComputeStreams < ComputeStreams.size()) {
UR_CHECK_ERROR(cuStreamCreateWithPriority(
&ComputeStreams[NumComputeStreams], Flags, Priority));
++NumComputeStreams;
}
}
Token = ComputeStreamIndex++;
StreamI = Token % ComputeStreams.size();
// if a stream has been reused before it was next selected round-robin
// fashion, we want to delay its next use and instead select another one
// that is more likely to have completed all the enqueued work.
if (DelayCompute[StreamI]) {
DelayCompute[StreamI] = false;
} else {
break;
}
}
if (StreamToken) {
*StreamToken = Token;
}
CUstream res = ComputeStreams[StreamI];
computeStreamWaitForBarrierIfNeeded(res, StreamI);
return res;
ur_queue_handle_t ur_queue_handle_t_::getEventQueue(const ur_event_handle_t e) {
return e->getQueue();
}

CUstream ur_queue_handle_t_::getNextComputeStream(
uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList,
ur_stream_guard_ &Guard, uint32_t *StreamToken) {
if (getThreadLocalStream() != CUstream{0})
return getThreadLocalStream();
for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
uint32_t Token = EventWaitList[i]->getComputeStreamToken();
if (reinterpret_cast<ur_queue_handle_t>(EventWaitList[i]->getQueue()) ==
this &&
canReuseStream(Token)) {
std::unique_lock<std::mutex> ComputeSyncGuard(ComputeStreamSyncMutex);
// redo the check after lock to avoid data races on
// LastSyncComputeStreams
if (canReuseStream(Token)) {
uint32_t StreamI = Token % DelayCompute.size();
DelayCompute[StreamI] = true;
if (StreamToken) {
*StreamToken = Token;
}
Guard = ur_stream_guard_{std::move(ComputeSyncGuard)};
CUstream Result = EventWaitList[i]->getStream();
computeStreamWaitForBarrierIfNeeded(Result, StreamI);
return Result;
}
}
}
Guard = {};
return getNextComputeStream(StreamToken);
uint32_t
ur_queue_handle_t_::getEventComputeStreamToken(const ur_event_handle_t e) {
return e->getComputeStreamToken();
}

CUstream ur_queue_handle_t_::getNextTransferStream() {
if (getThreadLocalStream() != CUstream{0})
return getThreadLocalStream();
if (TransferStreams.empty()) { // for example in in-order queue
return getNextComputeStream();
}
if (NumTransferStreams < TransferStreams.size()) {
// the check above is for performance - so as not to lock mutex every time
std::lock_guard<std::mutex> Guuard(TransferStreamMutex);
// The second check is done after mutex is locked so other threads can not
// change NumTransferStreams after that
if (NumTransferStreams < TransferStreams.size()) {
UR_CHECK_ERROR(cuStreamCreateWithPriority(
&TransferStreams[NumTransferStreams], Flags, Priority));
++NumTransferStreams;
}
}
uint32_t StreamI = TransferStreamIndex++ % TransferStreams.size();
CUstream Result = TransferStreams[StreamI];
transferStreamWaitForBarrierIfNeeded(Result, StreamI);
return Result;
CUstream ur_queue_handle_t_::getEventStream(const ur_event_handle_t e) {
return e->getStream();
}

/// Creates a `ur_queue_handle_t` object on the CUDA backend.
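The 83 deleted lines are the queue's stream-pool machinery (the two `getNextComputeStream` overloads and `getNextTransferStream`); the handful of added lines are thin per-event accessors (`getEventQueue`, `getEventComputeStreamToken`, `getEventStream`). Together with the new `unified-runtime/source/common/cuda-hip` CODEOWNERS entry, this suggests the pool logic is being moved into code shared between the CUDA and HIP adapters, though the excerpt does not show the destination. For reference, here is a self-contained sketch of the pattern the deleted code implemented: lazy stream creation under double-checked locking plus round-robin hand-out. The `DelayCompute` refinement, which postpones the next round-robin use of a stream that was just reused out of order, is left out for brevity; all names are illustrative, and the real code calls `cuStreamCreateWithPriority` where this sketch fills in an integer.

```cpp
// Illustrative sketch only: plain ints stand in for CUstream, and the class
// name is invented. The shape mirrors the deleted hunk: an unlocked size
// check, the same check repeated under the mutex (double-checked locking),
// then round-robin selection through a monotonically increasing token.
#include <atomic>
#include <cstdint>
#include <mutex>
#include <vector>

class StreamPoolSketch {
  std::vector<int> Streams = std::vector<int>(8, -1); // -1 == not created yet
  std::atomic<std::uint32_t> NumStreams{0};
  std::atomic<std::uint32_t> StreamIndex{0};
  std::mutex StreamMutex;

public:
  int getNextStream(std::uint32_t *TokenOut = nullptr) {
    if (NumStreams < Streams.size()) {
      // The cheap unlocked check above keeps the hot path lock-free; it is
      // repeated under the mutex so two threads cannot create the same slot.
      std::lock_guard<std::mutex> Guard(StreamMutex);
      if (NumStreams < Streams.size()) {
        Streams[NumStreams] = static_cast<int>(NumStreams); // "create" a stream
        ++NumStreams;
      }
    }
    std::uint32_t Token = StreamIndex++; // token grows without bound
    std::uint32_t Slot =
        Token % static_cast<std::uint32_t>(Streams.size()); // round-robin slot
    if (TokenOut)
      *TokenOut = Token;
    return Streams[Slot];
  }
};

int main() {
  StreamPoolSketch Pool;
  std::uint32_t Token = 0;
  int Stream = Pool.getNextStream(&Token); // first call lazily creates slot 0
  (void)Stream;
  return 0;
}
```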