Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,40 @@ lowerDynamicLocalMemCallDirect(CallInst *CI, Triple TT,

static void lowerLocalMemCall(Function *LocalMemAllocFunc,
std::function<void(CallInst *CI)> TransformCall) {
static SmallPtrSet<Function *, 16> FuncsCache;
SmallVector<CallInst *, 4> DelCalls;
for (User *U : LocalMemAllocFunc->users()) {
auto *CI = cast<CallInst>(U);
TransformCall(CI);
DelCalls.push_back(CI);
// Now, take each kernel that calls the builtins that allocate local memory,
// either directly or through a series of function calls that eventually end
// up in a direct call to the builtin, and attach the
// work-group-memory-static attribute to the kernel if not already attached.
// This is needed because free function kernels do not have the attribute
// added by the library as is the case with other types of kernels.
if (!FuncsCache.insert(CI->getFunction()).second)
continue; // We have already traversed call graph from this function.

SmallVector<Function *, 8> WorkList;
WorkList.push_back(CI->getFunction());
while (!WorkList.empty()) {
Function *F = WorkList.back();
WorkList.pop_back();

// Mark kernel as using scratch memory if it isn't marked already.
if (F->getCallingConv() == CallingConv::SPIR_KERNEL &&
!F->hasFnAttribute(WORK_GROUP_STATIC_ATTR))
F->addFnAttr(WORK_GROUP_STATIC_ATTR);

for (auto *FU : F->users()) {
if (auto *UCI = dyn_cast<CallInst>(FU)) {
if (FuncsCache.insert(UCI->getFunction()).second)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This if can be merged together with the one above, I think.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think they cannot be merged since this if is inside the while loop and the one on line 199 is not, they functions that are checked by these if statements may potentially be different.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean line 214 and 215.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The compiler seems to complain when I merge them.

WorkList.push_back(UCI->getFunction());
} // Even though there could be other uses of a Function, we don't
// care about them because we are only concerned about call graph.
}
}
}

for (auto *CI : DelCalls) {
Expand Down
20 changes: 20 additions & 0 deletions llvm/test/SYCLLowerIR/work_group_static.ll
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,29 @@ entry:
ret void
}

; Function Attrs: convergent norecurse
; CHECK: @__sycl_kernel_B{{.*}} #[[ATTRS:[0-9]+]]
define weak_odr dso_local spir_kernel void @__sycl_kernel_B(ptr addrspace(1) %0) local_unnamed_addr #1 !kernel_arg_addr_space !5 {
entry:
%1 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 128) #1
ret void
}

; Function Attrs: convergent norecurse
; CHECK: @__sycl_kernel_C{{.*}} #[[ATTRS]]
define weak_odr dso_local spir_kernel void @__sycl_kernel_C(ptr addrspace(1) %0) local_unnamed_addr #1 !kernel_arg_addr_space !5 {
entry:
%1 = tail call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 128, i64 4) #1
ret void
}

; Function Attrs: convergent
declare dso_local spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64, i64) local_unnamed_addr #1

; Function Attrs: convergent
declare dso_local spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64) local_unnamed_addr #1

; CHECK: #[[ATTRS]] = {{.*}} "sycl-work-group-static"
attributes #0 = { convergent norecurse "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" "sycl-work-group-static"="1" }
attributes #1 = { convergent norecurse }

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// REQUIRES: aspect-usm_shared_allocations
// UNSUPPORTED: target-amd
// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// This test verifies that we can compile, run and get correct results when
// using a free function kernel that allocates shared local memory in a kernel
// either by way of the work group scratch memory extension or the work group
// static memory extension.

#include "helpers.hpp"

#include <cassert>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this one should be the last one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The formatter disagrees, I have made the change manually so hopefully it wont fail the formatter pre-commit check.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, the formatter doesn't accept this so I am reverting it.

#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
#include <sycl/ext/oneapi/free_function_queries.hpp>
#include <sycl/ext/oneapi/work_group_static.hpp>
#include <sycl/group_barrier.hpp>
#include <sycl/usm.hpp>

namespace syclext = sycl::ext::oneapi;
namespace syclexp = sycl::ext::oneapi::experimental;

constexpr int SIZE = 16;

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>))
void scratchKernel(float *src, float *dst) {
size_t lid = syclext::this_work_item::get_nd_item<1>().get_local_linear_id();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It applies to the whole file: some variable names are not compliant with LLVM Coding Standards.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I made the changes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that variable names should start with an upper-case letter.

float *localMem =
reinterpret_cast<float *>(syclexp::get_work_group_scratch_memory());
localMem[lid] = 2 * src[lid];
dst[lid] = localMem[lid];
}

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>))
void staticKernel(float *src, float *dst) {
sycl::nd_item<1> item = syclext::this_work_item::get_nd_item<1>();
size_t lid = item.get_local_linear_id();
syclexp::work_group_static<float[SIZE]> localMem;
localMem[lid] = src[lid] * src[lid];
sycl::group_barrier(item.get_group());
if (item.get_group().leader()) { // Check that memory is indeed shared between
// the work group.
for (int i = 0; i < SIZE; ++i)
assert(localMem[i] == src[i] * src[i]);
}
dst[lid] = localMem[lid];
}

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>))
void scratchStaticKernel(float *src, float *dst) {
size_t lid = syclext::this_work_item::get_nd_item<1>().get_local_linear_id();
float *scratchMem =
reinterpret_cast<float *>(syclexp::get_work_group_scratch_memory());
syclexp::work_group_static<float[SIZE]> staticMem;
scratchMem[lid] = src[lid];
staticMem[lid] = src[lid];
dst[lid] = scratchMem[lid] + staticMem[lid];
}

int main() {
sycl::queue q;
float *src = sycl::malloc_shared<float>(SIZE, q);
float *dst = sycl::malloc_shared<float>(SIZE, q);

for (int i = 0; i < SIZE; i++) {
src[i] = i;
}

auto scratchBndl =
syclexp::get_kernel_bundle<scratchKernel, sycl::bundle_state::executable>(
q.get_context());
auto staticBndl =
syclexp::get_kernel_bundle<staticKernel, sycl::bundle_state::executable>(
q.get_context());
auto scratchStaticBndl = syclexp::get_kernel_bundle<
scratchStaticKernel, sycl::bundle_state::executable>(q.get_context());

sycl::kernel scratchKrn =
scratchBndl.template ext_oneapi_get_kernel<scratchKernel>();
sycl::kernel staticKrn =
staticBndl.template ext_oneapi_get_kernel<staticKernel>();
sycl::kernel scratchStaticKrn =
scratchStaticBndl.template ext_oneapi_get_kernel<scratchStaticKernel>();
syclexp::launch_config scratchKernelcfg{
::sycl::nd_range<1>(::sycl::range<1>(SIZE), ::sycl::range<1>(SIZE)),
syclexp::properties{
syclexp::work_group_scratch_size(SIZE * sizeof(float))}};
syclexp::launch_config staticKernelcfg{
::sycl::nd_range<1>(::sycl::range<1>(SIZE), ::sycl::range<1>(SIZE))};

syclexp::nd_launch(q, scratchKernelcfg, scratchKrn, src, dst);
q.wait();
for (int i = 0; i < SIZE; i++) {
assert(dst[i] == 2 * src[i]);
}

syclexp::nd_launch(q, staticKernelcfg, staticKrn, src, dst);
q.wait();
for (int i = 0; i < SIZE; i++) {
assert(dst[i] == src[i] * src[i]);
}

syclexp::nd_launch(q, scratchKernelcfg, scratchStaticKrn, src, dst);
q.wait();
for (int i = 0; i < SIZE; i++) {
assert(dst[i] == 2 * src[i]);
}

sycl::free(src, q);
sycl::free(dst, q);
return 0;
}
Loading