Skip to content

Commit 8e14aa3

Browse files
Merge remote-tracking branch 'origin/sycl' into HEAD
2 parents 369242a + b24454d commit 8e14aa3

File tree

30 files changed

+292
-235
lines changed

30 files changed

+292
-235
lines changed

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -968,6 +968,11 @@ void CudaToolChain::addClangTargetOptions(
968968
"--nvptx-prec-sqrtf32=0"});
969969

970970
CC1Args.append({"-mllvm", "-enable-memcpyopt-without-libcalls"});
971+
972+
if (DriverArgs.hasFlag(options::OPT_fsycl_id_queries_fit_in_int,
973+
options::OPT_fno_sycl_id_queries_fit_in_int, false))
974+
CC1Args.append(
975+
{"-mllvm", "-nvvm-reflect-add=__CUDA_ID_QUERIES_FIT_IN_INT=1"});
971976
} else {
972977
CC1Args.append({"-fcuda-is-device", "-mllvm",
973978
"-enable-memcpyopt-without-libcalls",
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// REQUIRES: nvptx-registered-target
2+
3+
// RUN: %clang -### -nocudalib \
4+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda %s 2>&1 \
5+
// RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s
6+
7+
// RUN: %clang -### -nocudalib \
8+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fno-sycl-id-queries-fit-in-int %s 2>&1 \
9+
// RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s
10+
11+
// RUN: %clang -### -nocudalib \
12+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fsycl-id-queries-fit-in-int %s 2>&1 \
13+
// RUN: | FileCheck --check-prefix=CHECK-INT %s
14+
15+
// CHECK-INT: "-mllvm" "-nvvm-reflect-add=__CUDA_ID_QUERIES_FIT_IN_INT=1"
16+
// CHECK-DEFAULT-NOT: "-nvvm-reflect-add=__CUDA_ID_QUERIES_FIT_IN_INT=1"

libclc/libspirv/lib/ptx-nvidiacl/workitem/get_global_id.cl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,31 @@
88

99
#include <libspirv/spirv.h>
1010

11+
extern int __nvvm_reflect_ocl(constant char *);
12+
1113
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_x() {
14+
if (__nvvm_reflect_ocl("__CUDA_ID_QUERIES_FIT_IN_INT")) {
15+
return (uint)__spirv_WorkgroupId_x() * (uint)__spirv_WorkgroupSize_x() +
16+
(uint)__spirv_LocalInvocationId_x() + (uint)__spirv_GlobalOffset_x();
17+
}
1218
return __spirv_WorkgroupId_x() * __spirv_WorkgroupSize_x() +
1319
__spirv_LocalInvocationId_x() + __spirv_GlobalOffset_x();
1420
}
1521

1622
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_y() {
23+
if (__nvvm_reflect_ocl("__CUDA_ID_QUERIES_FIT_IN_INT")) {
24+
return (uint)__spirv_WorkgroupId_y() * (uint)__spirv_WorkgroupSize_y() +
25+
(uint)__spirv_LocalInvocationId_y() + (uint)__spirv_GlobalOffset_y();
26+
}
1727
return __spirv_WorkgroupId_y() * __spirv_WorkgroupSize_y() +
1828
__spirv_LocalInvocationId_y() + __spirv_GlobalOffset_y();
1929
}
2030

2131
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_z() {
32+
if (__nvvm_reflect_ocl("__CUDA_ID_QUERIES_FIT_IN_INT")) {
33+
return (uint)__spirv_WorkgroupId_z() * (uint)__spirv_WorkgroupSize_z() +
34+
(uint)__spirv_LocalInvocationId_z() + (uint)__spirv_GlobalOffset_z();
35+
}
2236
return __spirv_WorkgroupId_z() * __spirv_WorkgroupSize_z() +
2337
__spirv_LocalInvocationId_z() + __spirv_GlobalOffset_z();
2438
}

libdevice/crt_wrapper.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,8 @@
1212

1313
#include <cstdint>
1414

15-
#ifndef __NVPTX__
1615
#define RAND_NEXT_LEN 1024
1716
DeviceGlobal<uint64_t[RAND_NEXT_LEN]> RandNext;
18-
#endif
1917

2018
#if defined(__SPIR__) || defined(__SPIRV__) || defined(__NVPTX__) || \
2119
defined(__AMDGCN__)
@@ -34,8 +32,6 @@ int memcmp(const void *s1, const void *s2, size_t n) {
3432
return __devicelib_memcmp(s1, s2, n);
3533
}
3634

37-
#ifndef __NVPTX__
38-
3935
// This simple rand is for ease of use only, the implementation aligns with
4036
// LLVM libc rand which is based on xorshift64star pseudo random number
4137
// generator. If work item number <= 1024, each work item has its own internal
@@ -107,8 +103,6 @@ void srand(unsigned int seed) {
107103
RAND_NEXT_ACC[gid1] = seed;
108104
}
109105

110-
#endif
111-
112106
#if defined(_WIN32)
113107
// Truncates a wide (16 or 32 bit) string (wstr) into an ASCII string (str).
114108
// Any non-ASCII characters are replaced by question mark '?'.

sycl/doc/design/SYCLBINDesign.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ clang-offload-packager invocation to clang-linker-wrapper together with the new
225225
`--syclbin` flag.
226226

227227
Setting this option will override `-fsycl`. Passing`-fsycl-device-only` with
228-
`-fsyclbin` will cause `-fsycl-device-only` to be considered unused.
228+
`-fsyclbin` will cause `-fsyclbin` to be considered unused.
229229

230230
The behavior is dependent on using the clang-linker-wrapper. As the current
231231
default offload compilation behavior is using the old offload model (driver

sycl/source/detail/cg.hpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -725,14 +725,10 @@ class CGHostTask : public CG {
725725
std::shared_ptr<detail::context_impl> MContext;
726726
std::vector<ArgDesc> MArgs;
727727

728-
CGHostTask(std::shared_ptr<HostTask> HostTask,
729-
std::shared_ptr<detail::queue_impl> Queue,
728+
CGHostTask(std::shared_ptr<HostTask> HostTask, detail::queue_impl *Queue,
730729
std::shared_ptr<detail::context_impl> Context,
731730
std::vector<ArgDesc> Args, CG::StorageInitHelper CGData,
732-
CGType Type, detail::code_location loc = {})
733-
: CG(Type, std::move(CGData), std::move(loc)),
734-
MHostTask(std::move(HostTask)), MQueue(Queue), MContext(Context),
735-
MArgs(std::move(Args)) {}
731+
CGType Type, detail::code_location loc = {});
736732
};
737733

738734
} // namespace detail

sycl/source/detail/event_impl.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ void event_impl::setQueue(queue_impl &Queue) {
218218
void event_impl::setSubmittedQueue(std::weak_ptr<queue_impl> SubmittedQueue) {
219219
MSubmittedQueue = std::move(SubmittedQueue);
220220
if (MHostProfilingInfo) {
221-
if (auto QueuePtr = MSubmittedQueue.lock()) {
221+
if (std::shared_ptr<queue_impl> QueuePtr = MSubmittedQueue.lock()) {
222222
device_impl &Device = QueuePtr->getDeviceImpl();
223223
MHostProfilingInfo->setDevice(&Device);
224224
}
@@ -251,7 +251,7 @@ void *event_impl::instrumentationProlog(std::string &Name, int32_t StreamID,
251251
// queue is available with the wait events. We check to see if the
252252
// TraceEvent is available in the Queue object.
253253
void *TraceEvent = nullptr;
254-
if (QueueImplPtr Queue = MQueue.lock()) {
254+
if (std::shared_ptr<queue_impl> Queue = MQueue.lock()) {
255255
TraceEvent = Queue->getTraceEvent();
256256
WaitEvent =
257257
(TraceEvent ? static_cast<xpti_td *>(TraceEvent) : GSYCLGraphEvent);
@@ -320,7 +320,7 @@ void event_impl::wait_and_throw(
320320
std::shared_ptr<sycl::detail::event_impl> Self) {
321321
wait(Self);
322322

323-
if (QueueImplPtr SubmittedQueue = MSubmittedQueue.lock())
323+
if (std::shared_ptr<queue_impl> SubmittedQueue = MSubmittedQueue.lock())
324324
SubmittedQueue->throw_asynchronous();
325325
}
326326

@@ -465,7 +465,7 @@ event_impl::get_backend_info<info::platform::version>() const {
465465
"the info::platform::version info descriptor can "
466466
"only be queried with an OpenCL backend");
467467
}
468-
if (QueueImplPtr Queue = MQueue.lock()) {
468+
if (std::shared_ptr<queue_impl> Queue = MQueue.lock()) {
469469
return Queue->getDeviceImpl()
470470
.get_platform()
471471
.get_info<info::platform::version>();
@@ -488,7 +488,7 @@ event_impl::get_backend_info<info::device::version>() const {
488488
"the info::device::version info descriptor can only "
489489
"be queried with an OpenCL backend");
490490
}
491-
if (QueueImplPtr Queue = MQueue.lock()) {
491+
if (std::shared_ptr<queue_impl> Queue = MQueue.lock()) {
492492
return Queue->getDeviceImpl().get_info<info::device::version>();
493493
}
494494
return ""; // If the queue has been released, no device will be associated so
@@ -555,21 +555,21 @@ std::vector<EventImplPtr> event_impl::getWaitList() {
555555
return Result;
556556
}
557557

558-
void event_impl::flushIfNeeded(const QueueImplPtr &UserQueue) {
558+
void event_impl::flushIfNeeded(queue_impl *UserQueue) {
559559
// Some events might not have a native handle underneath even at this point,
560560
// e.g. those produced by memset with 0 size (no UR call is made).
561561
auto Handle = this->getHandle();
562562
if (MIsFlushed || !Handle)
563563
return;
564564

565-
QueueImplPtr Queue = MQueue.lock();
565+
std::shared_ptr<queue_impl> Queue = MQueue.lock();
566566
// If the queue has been released, all of the commands have already been
567567
// implicitly flushed by urQueueRelease.
568568
if (!Queue) {
569569
MIsFlushed = true;
570570
return;
571571
}
572-
if (Queue == UserQueue)
572+
if (Queue.get() == UserQueue)
573573
return;
574574

575575
// Check if the task for this event has already been submitted.
@@ -607,9 +607,9 @@ void event_impl::setSubmissionTime() {
607607
if (!MIsProfilingEnabled && !MProfilingTagEvent)
608608
return;
609609

610-
std::weak_ptr<queue_impl> Queue = isHost() ? MSubmittedQueue : MQueue;
611-
if (QueueImplPtr QueuePtr = Queue.lock()) {
612-
device_impl &Device = QueuePtr->getDeviceImpl();
610+
if (std::shared_ptr<queue_impl> Queue =
611+
isHost() ? MSubmittedQueue.lock() : MQueue.lock()) {
612+
device_impl &Device = Queue->getDeviceImpl();
613613
MSubmitTime = getTimestamp(&Device);
614614
}
615615
}

sycl/source/detail/event_impl.hpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ class Adapter;
3131
class context_impl;
3232
using ContextImplPtr = std::shared_ptr<sycl::detail::context_impl>;
3333
class queue_impl;
34-
using QueueImplPtr = std::shared_ptr<sycl::detail::queue_impl>;
3534
class event_impl;
3635
using EventImplPtr = std::shared_ptr<sycl::detail::event_impl>;
3736

@@ -238,7 +237,7 @@ class event_impl : public std::enable_shared_from_this<event_impl> {
238237
/// Performs a flush on the queue associated with this event if the user queue
239238
/// is different and the task associated with this event hasn't been submitted
240239
/// to the device yet.
241-
void flushIfNeeded(const QueueImplPtr &UserQueue);
240+
void flushIfNeeded(queue_impl *UserQueue);
242241

243242
/// Cleans dependencies of this event_impl.
244243
void cleanupDependencyEvents();
@@ -258,7 +257,9 @@ class event_impl : public std::enable_shared_from_this<event_impl> {
258257
///
259258
/// @return shared_ptr to MWorkerQueue, please be aware it can be empty
260259
/// pointer
261-
QueueImplPtr getWorkerQueue() { return MWorkerQueue.lock(); };
260+
std::shared_ptr<sycl::detail::queue_impl> getWorkerQueue() {
261+
return MWorkerQueue.lock();
262+
};
262263

263264
/// Sets worker queue for command.
264265
///
@@ -285,7 +286,9 @@ class event_impl : public std::enable_shared_from_this<event_impl> {
285286
/// @return Submission time for command associated with this event
286287
uint64_t getSubmissionTime();
287288

288-
QueueImplPtr getSubmittedQueue() const { return MSubmittedQueue.lock(); };
289+
std::shared_ptr<sycl::detail::queue_impl> getSubmittedQueue() const {
290+
return MSubmittedQueue.lock();
291+
};
289292

290293
/// Checks if this event is complete.
291294
///

sycl/source/detail/graph_impl.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ class node_impl : public std::enable_shared_from_this<node_impl> {
288288

289289
return std::make_unique<sycl::detail::CGHostTask>(
290290
sycl::detail::CGHostTask(
291-
std::move(HostTaskSPtr), CommandGroupPtr->MQueue,
291+
std::move(HostTaskSPtr), CommandGroupPtr->MQueue.get(),
292292
CommandGroupPtr->MContext, std::move(NewArgs), std::move(Data),
293293
CommandGroupPtr->getType(), Loc));
294294
}

sycl/source/detail/queue_impl.hpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -650,9 +650,12 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
650650
// for in order ones.
651651
void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask);
652652

653-
static ContextImplPtr getContext(const QueueImplPtr &Queue) {
653+
static ContextImplPtr getContext(queue_impl *Queue) {
654654
return Queue ? Queue->getContextImplPtr() : nullptr;
655655
}
656+
static ContextImplPtr getContext(const QueueImplPtr &Queue) {
657+
return getContext(Queue.get());
658+
}
656659

657660
// Must be called under MMutex protection
658661
void doUnenqueuedCommandCleanup(

0 commit comments

Comments
 (0)