1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -75,6 +75,7 @@ llvm/include/llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h @intel/llvm-reviewers-
 llvm/lib/SYCLLowerIR/SYCLCreateNVVMAnnotations.cpp @intel/llvm-reviewers-cuda
 llvm/lib/Target/NVPTX @intel/llvm-reviewers-cuda
 llvm/lib/Target/AMDGPU @intel/llvm-reviewers-cuda
+unified-runtime/source/common/cuda-hip @intel/llvm-reviewers-cuda
 
 # XPTI instrumentation utilities
 xpti/ @intel/llvm-reviewers-runtime
4 changes: 2 additions & 2 deletions unified-runtime/source/adapters/cuda/async_alloc.cpp
@@ -25,7 +25,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMDeviceAllocExp(
 
   ScopedContext Active(hQueue->getDevice());
   uint32_t StreamToken;
-  ur_stream_guard_ Guard;
+  ur_stream_guard Guard;
   CUstream CuStream = hQueue->getNextComputeStream(
       numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
@@ -83,7 +83,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFreeExp(
 
   ScopedContext Active(hQueue->getDevice());
   uint32_t StreamToken;
-  ur_stream_guard_ Guard;
+  ur_stream_guard Guard;
   CUstream CuStream = hQueue->getNextComputeStream(
       numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
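The only functional change in this file (and in command_buffer.cpp and enqueue.cpp below) is the rename of the stream-guard type from ur_stream_guard_ to ur_stream_guard. Judging by the call sites and by the queue code deleted later in this PR, the guard is an RAII object that can hold the queue's stream-sync lock while the caller enqueues work on the returned stream. The following is a minimal, self-contained sketch of that pattern only; every name in it (StreamGuard, FakeQueue, FakeStream) is a hypothetical stand-in, not the adapter's actual definition.

// Sketch only: illustrates a stream-guard pattern like the one these call
// sites rely on. All names here are hypothetical, not the adapter's API.
#include <cstdint>
#include <mutex>
#include <utility>
#include <vector>

using FakeStream = int; // stand-in for CUstream

class StreamGuard {
public:
  StreamGuard() = default;
  // Takes ownership of an already-held lock and releases it when the guard
  // goes out of scope, i.e. once the work has been enqueued.
  explicit StreamGuard(std::unique_lock<std::mutex> &&L) : Lock(std::move(L)) {}

private:
  std::unique_lock<std::mutex> Lock;
};

struct FakeQueue {
  std::vector<FakeStream> ComputeStreams{0, 1, 2, 3};
  std::mutex SyncMutex;
  std::uint32_t Index = 0;

  // Mirrors the shape of getNextComputeStream(numEvents, waitList, Guard,
  // &Token): hand back a stream and move the sync lock into the caller's
  // guard so it stays held for the duration of the enqueue.
  FakeStream getNextComputeStream(StreamGuard &Guard, std::uint32_t *Token) {
    std::unique_lock<std::mutex> Lock(SyncMutex);
    std::uint32_t T = Index++;
    if (Token)
      *Token = T;
    Guard = StreamGuard{std::move(Lock)};
    return ComputeStreams[T % ComputeStreams.size()];
  }
};

int main() {
  FakeQueue Q;
  std::uint32_t Token = 0;
  StreamGuard Guard;
  FakeStream S = Q.getNextComputeStream(Guard, &Token);
  // ... enqueue work on S here; Guard releases the lock when destroyed.
  (void)S;
  (void)Token;
  return 0;
}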
2 changes: 1 addition & 1 deletion unified-runtime/source/adapters/cuda/command_buffer.cpp
@@ -1133,7 +1133,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp(
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
   ScopedContext Active(hQueue->getDevice());
   uint32_t StreamToken;
-  ur_stream_guard_ Guard;
+  ur_stream_guard Guard;
   CUstream CuStream = hQueue->getNextComputeStream(
       numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
8 changes: 4 additions & 4 deletions unified-runtime/source/adapters/cuda/enqueue.cpp
@@ -315,7 +315,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
   try {
     ScopedContext Active(hQueue->getDevice());
     uint32_t StreamToken;
-    ur_stream_guard_ Guard;
+    ur_stream_guard Guard;
     CUstream CuStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
     {
@@ -440,7 +440,7 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
 
   ScopedContext Active(hQueue->getDevice());
   uint32_t StreamToken;
-  ur_stream_guard_ Guard;
+  ur_stream_guard Guard;
   CUstream CuStream = hQueue->getNextComputeStream(
       numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
@@ -628,7 +628,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
 
   ScopedContext Active(hQueue->getDevice());
   uint32_t StreamToken;
-  ur_stream_guard_ Guard;
+  ur_stream_guard Guard;
   CUstream CuStream = hQueue->getNextComputeStream(
       numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
@@ -1517,7 +1517,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
   try {
     ScopedContext Active(hQueue->getDevice());
     uint32_t StreamToken;
-    ur_stream_guard_ Guard;
+    ur_stream_guard Guard;
     CUstream CuStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
     UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
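The -1517 hunk also shows the usual prologue of these entry points: after a stream is picked, the dependency list is flushed with enqueueEventsWait before the actual work is submitted. Below is one plausible shape of such a helper on top of the CUDA driver API; cuStreamWaitEvent is the real driver call, but the helper name, parameters, and error handling here are simplified stand-ins, not the adapter's implementation.

// Sketch only: a simplified "wait on a list of events" helper in the spirit
// of enqueueEventsWait. The adapter's real helper does more than this.
#include <cuda.h>
#include <cstdint>

// Make the stream wait (asynchronously, on the device) for every event in the
// list before any subsequently enqueued work may start.
CUresult waitOnEvents(CUstream Stream, std::uint32_t NumEvents,
                      const CUevent *Events) {
  for (std::uint32_t I = 0; I < NumEvents; ++I) {
    CUresult Err = cuStreamWaitEvent(Stream, Events[I], 0);
    if (Err != CUDA_SUCCESS)
      return Err;
  }
  return CUDA_SUCCESS;
}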
8 changes: 4 additions & 4 deletions unified-runtime/source/adapters/cuda/event.cpp
@@ -87,17 +87,17 @@ bool ur_event_handle_t_::isCompleted() const noexcept try {
 
 uint64_t ur_event_handle_t_::getQueuedTime() const {
   assert(isStarted());
-  return Queue->get_device()->getElapsedTime(EvQueued);
+  return Queue->getDevice()->getElapsedTime(EvQueued);
 }
 
 uint64_t ur_event_handle_t_::getStartTime() const {
   assert(isStarted());
-  return Queue->get_device()->getElapsedTime(EvStart);
+  return Queue->getDevice()->getElapsedTime(EvStart);
 }
 
 uint64_t ur_event_handle_t_::getEndTime() const {
   assert(isStarted() && isRecorded());
-  return Queue->get_device()->getElapsedTime(EvEnd);
+  return Queue->getDevice()->getElapsedTime(EvEnd);
 }
 
 ur_result_t ur_event_handle_t_::record() {
@@ -111,7 +111,7 @@ ur_result_t ur_event_handle_t_::record() {
   UR_ASSERT(Queue, UR_RESULT_ERROR_INVALID_QUEUE);
 
   try {
-    EventID = Queue->getNextEventID();
+    EventID = Queue->getNextEventId();
     if (EventID == 0) {
       die("Unrecoverable program state reached in event identifier overflow");
     }
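Besides the getNextEventId rename, this file switches the profiling accessors from Queue->get_device() to Queue->getDevice(). The returned device object exposes getElapsedTime, which presumably converts the distance between a recorded CUevent and some fixed base event into nanoseconds. The sketch below shows that conversion using the real driver call cuEventElapsedTime; the base-event parameter and the exact scaling are assumptions for illustration, not necessarily how the adapter implements it.

// Sketch only: elapsed-time computation in the spirit of getElapsedTime.
// cuEventElapsedTime is the real CUDA driver call; treating Base as a fixed
// per-device reference event is an assumption.
#include <cuda.h>
#include <cstdint>

// Returns the time of Event relative to Base in nanoseconds.
// cuEventElapsedTime reports milliseconds as a float, so scale by 1e6.
std::uint64_t elapsedSinceBase(CUevent Base, CUevent Event) {
  float Milliseconds = 0.0f;
  if (cuEventElapsedTime(&Milliseconds, Base, Event) != CUDA_SUCCESS)
    return 0; // a real implementation would propagate the error
  return static_cast<std::uint64_t>(Milliseconds * 1.0e6f);
}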
90 changes: 7 additions & 83 deletions unified-runtime/source/adapters/cuda/queue.cpp
@@ -32,93 +32,17 @@ void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded(
   }
 }
 
-CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) {
-  if (getThreadLocalStream() != CUstream{0})
-    return getThreadLocalStream();
-  uint32_t StreamI;
-  uint32_t Token;
-  while (true) {
-    if (NumComputeStreams < ComputeStreams.size()) {
-      // the check above is for performance - so as not to lock mutex every time
-      std::lock_guard<std::mutex> guard(ComputeStreamMutex);
-      // The second check is done after mutex is locked so other threads can not
-      // change NumComputeStreams after that
-      if (NumComputeStreams < ComputeStreams.size()) {
-        UR_CHECK_ERROR(cuStreamCreateWithPriority(
-            &ComputeStreams[NumComputeStreams], Flags, Priority));
-        ++NumComputeStreams;
-      }
-    }
-    Token = ComputeStreamIndex++;
-    StreamI = Token % ComputeStreams.size();
-    // if a stream has been reused before it was next selected round-robin
-    // fashion, we want to delay its next use and instead select another one
-    // that is more likely to have completed all the enqueued work.
-    if (DelayCompute[StreamI]) {
-      DelayCompute[StreamI] = false;
-    } else {
-      break;
-    }
-  }
-  if (StreamToken) {
-    *StreamToken = Token;
-  }
-  CUstream res = ComputeStreams[StreamI];
-  computeStreamWaitForBarrierIfNeeded(res, StreamI);
-  return res;
+ur_queue_handle_t ur_queue_handle_t_::getEventQueue(const ur_event_handle_t e) {
+  return e->getQueue();
 }
 
-CUstream ur_queue_handle_t_::getNextComputeStream(
-    uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList,
-    ur_stream_guard_ &Guard, uint32_t *StreamToken) {
-  if (getThreadLocalStream() != CUstream{0})
-    return getThreadLocalStream();
-  for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
-    uint32_t Token = EventWaitList[i]->getComputeStreamToken();
-    if (reinterpret_cast<ur_queue_handle_t>(EventWaitList[i]->getQueue()) ==
-            this &&
-        canReuseStream(Token)) {
-      std::unique_lock<std::mutex> ComputeSyncGuard(ComputeStreamSyncMutex);
-      // redo the check after lock to avoid data races on
-      // LastSyncComputeStreams
-      if (canReuseStream(Token)) {
-        uint32_t StreamI = Token % DelayCompute.size();
-        DelayCompute[StreamI] = true;
-        if (StreamToken) {
-          *StreamToken = Token;
-        }
-        Guard = ur_stream_guard_{std::move(ComputeSyncGuard)};
-        CUstream Result = EventWaitList[i]->getStream();
-        computeStreamWaitForBarrierIfNeeded(Result, StreamI);
-        return Result;
-      }
-    }
-  }
-  Guard = {};
-  return getNextComputeStream(StreamToken);
+uint32_t
+ur_queue_handle_t_::getEventComputeStreamToken(const ur_event_handle_t e) {
+  return e->getComputeStreamToken();
 }
 
-CUstream ur_queue_handle_t_::getNextTransferStream() {
-  if (getThreadLocalStream() != CUstream{0})
-    return getThreadLocalStream();
-  if (TransferStreams.empty()) { // for example in in-order queue
-    return getNextComputeStream();
-  }
-  if (NumTransferStreams < TransferStreams.size()) {
-    // the check above is for performance - so as not to lock mutex every time
-    std::lock_guard<std::mutex> Guuard(TransferStreamMutex);
-    // The second check is done after mutex is locked so other threads can not
-    // change NumTransferStreams after that
-    if (NumTransferStreams < TransferStreams.size()) {
-      UR_CHECK_ERROR(cuStreamCreateWithPriority(
-          &TransferStreams[NumTransferStreams], Flags, Priority));
-      ++NumTransferStreams;
-    }
-  }
-  uint32_t StreamI = TransferStreamIndex++ % TransferStreams.size();
-  CUstream Result = TransferStreams[StreamI];
-  transferStreamWaitForBarrierIfNeeded(Result, StreamI);
-  return Result;
+CUstream ur_queue_handle_t_::getEventStream(const ur_event_handle_t e) {
+  return e->getStream();
 }
 
 /// Creates a `ur_queue_handle_t` object on the CUDA backend.
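The bulk of the PR removes the CUDA-specific getNextComputeStream and getNextTransferStream implementations from the adapter; together with the new CODEOWNERS entry for unified-runtime/source/common/cuda-hip, this suggests the stream-scheduling logic now lives in code shared with the HIP adapter, while the queue keeps only thin event accessors (getEventQueue, getEventComputeStreamToken, getEventStream). The deleted code still summarizes the scheme well: streams are created lazily under a double-checked lock, handed out round-robin, and a DelayCompute flag postpones reuse of a stream that was just given back to a dependent command. The self-contained sketch below reproduces only the lazy-creation and round-robin part with standard C++ and placeholder integer streams; it is an illustration of the pattern, not the shared implementation.

// Sketch only: lazy stream creation with a double-checked lock plus
// round-robin selection, modeled on the deleted getNextComputeStream.
// Streams are plain ints here so the example builds without CUDA.
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <vector>

class StreamPool {
public:
  // PoolSize must be greater than zero.
  explicit StreamPool(std::size_t PoolSize) : Streams(PoolSize, -1) {}

  int getNextStream() {
    // First check without the lock purely as a fast path; the authoritative
    // check is repeated below once the mutex is held.
    if (NumCreated.load(std::memory_order_acquire) < Streams.size()) {
      std::lock_guard<std::mutex> Guard(Mutex);
      if (NumCreated.load(std::memory_order_relaxed) < Streams.size()) {
        std::size_t I = NumCreated.load(std::memory_order_relaxed);
        Streams[I] = static_cast<int>(I); // stands in for cuStreamCreateWithPriority
        NumCreated.store(I + 1, std::memory_order_release);
      }
    }
    // Round-robin over the streams created so far.
    std::uint32_t Token = Index.fetch_add(1, std::memory_order_relaxed);
    return Streams[Token % NumCreated.load(std::memory_order_acquire)];
  }

private:
  std::vector<int> Streams;
  std::atomic<std::size_t> NumCreated{0};
  std::atomic<std::uint32_t> Index{0};
  std::mutex Mutex;
};

int main() {
  StreamPool Pool(4);
  // The first few calls create streams one by one; later calls cycle over them.
  for (int I = 0; I < 8; ++I)
    (void)Pool.getNextStream();
  return 0;
}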