Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion buildbot/configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def do_configure(args, passthrough_args):
if not os.path.isdir(abs_obj_dir):
os.makedirs(abs_obj_dir)

llvm_external_projects = "sycl;llvm-spirv;opencl;xpti;xptifw"
llvm_external_projects = "sycl;llvm-spirv;opencl;xpti;xptifw;compiler-rt"

# libdevice build requires a working SYCL toolchain, which is not the case
# with macOS target right now.
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/CodeGen/BackendUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,7 @@ getInstrProfOptions(const CodeGenOptions &CodeGenOpts,
Options.InstrProfileOutput = CodeGenOpts.ContinuousProfileSync
? ("%c" + CodeGenOpts.InstrProfileOutput)
: CodeGenOpts.InstrProfileOutput;
Options.Atomic = CodeGenOpts.AtomicProfileUpdate;
Options.Atomic = LangOpts.SYCLIsDevice || CodeGenOpts.AtomicProfileUpdate;
return Options;
}

Expand Down
6 changes: 1 addition & 5 deletions clang/lib/Driver/ToolChains/SYCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1574,11 +1574,7 @@ static ArrayRef<options::ID> getUnsupportedOpts() {
options::OPT_fno_profile_generate, // -f[no-]profile-generate
options::OPT_ftest_coverage,
options::OPT_fno_test_coverage, // -f[no-]test-coverage
options::OPT_fcoverage_mapping,
options::OPT_coverage, // --coverage
options::OPT_fno_coverage_mapping, // -f[no-]coverage-mapping
options::OPT_fprofile_instr_generate,
options::OPT_fprofile_instr_generate_EQ,
options::OPT_coverage, // --coverage
options::OPT_fprofile_arcs,
options::OPT_fno_profile_arcs, // -f[no-]profile-arcs
options::OPT_fno_profile_instr_generate, // -f[no-]profile-instr-generate
Expand Down
13 changes: 0 additions & 13 deletions clang/test/Driver/sycl-unsupported.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,6 @@
// RUN: -DOPT_CC1=-debug-info-kind=line-tables-only \
// RUN: -check-prefixes=UNSUPPORTED_OPT_DIAG,UNSUPPORTED_OPT

// RUN: %clangxx -fsycl -fprofile-instr-generate -### %s 2>&1 \
// RUN: | FileCheck %s -DARCH=spir64 -DOPT=-fprofile-instr-generate \
// RUN: -DOPT_CC1=-fprofile-instrument=clang \
// RUN: -check-prefixes=UNSUPPORTED_OPT_DIAG,UNSUPPORTED_OPT
// RUN: %clangxx -fsycl -fcoverage-mapping \
// RUN: -fprofile-instr-generate -### %s 2>&1 \
// RUN: | FileCheck %s -DARCH=spir64 -DOPT=-fcoverage-mapping
// RUN: %clangxx -fsycl -ftest-coverage -### %s 2>&1 \
// RUN: | FileCheck %s -DARCH=spir64 -DOPT=-ftest-coverage \
// RUN: -DOPT_CC1=-coverage-notes-file \
Expand All @@ -49,12 +42,6 @@
// RUN: | FileCheck %s -DARCH=spir64 -DOPT=--coverage \
// RUN: -DOPT_CC1=-coverage-notes-file \
// RUN: -check-prefixes=UNSUPPORTED_OPT_DIAG,UNSUPPORTED_OPT
// Check to make sure our '-fsanitize=address' exception isn't triggered by a
// different option
// RUN: %clangxx -fsycl -fprofile-instr-generate=address -### %s 2>&1 \
// RUN: | FileCheck %s -DARCH=spir64 -DOPT=-fprofile-instr-generate=address \
// RUN: -DOPT_CC1=-fprofile-instrument=clang \
// RUN: -check-prefixes=UNSUPPORTED_OPT_DIAG,UNSUPPORTED_OPT

// CHECK: ignoring '[[OPT]]' option as it is not currently supported for target '[[ARCH]]{{.*}}'; only supported for host compilation [-Woption-ignored]
// CHECK-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "[[OPT]]{{.*}}"
Expand Down
16 changes: 16 additions & 0 deletions compiler-rt/lib/profile/InstrProfilingRuntime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,22 @@ extern "C" {

#include "InstrProfiling.h"

void __sycl_increment_profile_counters(uint64_t FnHash, size_t NumCounters,
const uint64_t *Increments) {
for (const __llvm_profile_data *DataVar = __llvm_profile_begin_data();
DataVar < __llvm_profile_end_data(); DataVar++) {
if (DataVar->NameRef != FnHash || DataVar->NumCounters != NumCounters)
continue;

uint64_t *const Counters = reinterpret_cast<uint64_t *>(
reinterpret_cast<uintptr_t>(DataVar) +
reinterpret_cast<uintptr_t>(DataVar->CounterPtr));
for (size_t i = 0; i < NumCounters; i++)
Counters[i] += Increments[i];
break;
}
}

static int RegisterRuntime() {
__llvm_profile_initialize();
#ifdef _AIX
Expand Down
36 changes: 36 additions & 0 deletions llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1002,6 +1002,9 @@ bool InstrLowerer::lower() {
if (!NeedsRuntimeHook && ContainsProfiling)
emitRuntimeHook();

if (M.getTargetTriple().isSPIR())
return true;

emitRegistration();
emitUses();
emitInitialization();
Expand Down Expand Up @@ -1116,6 +1119,22 @@ GlobalVariable *InstrLowerer::getOrCreateBiasVar(StringRef VarName) {
}

Value *InstrLowerer::getCounterAddress(InstrProfCntrInstBase *I) {
if (M.getTargetTriple().isSPIR()) {
auto *Counters = getOrCreateRegionCounters(I);
IRBuilder<> Builder(I);
auto *Addr = Builder.CreateLoad(PointerType::get(M.getContext(), 1),
Counters, "pgocount.addr");
const std::uint64_t Index = I->getIndex()->getZExtValue();
if (Index > 0) {
auto *Offset = Builder.getInt64(I->getIndex()->getZExtValue() *
sizeof(std::uint64_t));
auto *AddrWithOffset =
Builder.CreatePtrAdd(Addr, Offset, "pgocount.offset");
return AddrWithOffset;
}
return Addr;
}

auto *Counters = getOrCreateRegionCounters(I);
IRBuilder<> Builder(I);

Expand Down Expand Up @@ -1657,6 +1676,23 @@ InstrLowerer::getOrCreateRegionBitmaps(InstrProfMCDCBitmapInstBase *Inc) {
GlobalVariable *
InstrLowerer::createRegionCounters(InstrProfCntrInstBase *Inc, StringRef Name,
GlobalValue::LinkageTypes Linkage) {
if (M.getTargetTriple().isSPIR()) {
uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
auto &Ctx = M.getContext();
GlobalVariable *GV;
auto *PtrTy = PointerType::get(Ctx, 1);
auto *IntTy = Type::getInt64Ty(Ctx);
auto *StructTy = StructType::get(Ctx, {PtrTy, IntTy});
GV = new GlobalVariable(M, StructTy, false, Linkage,
Constant::getNullValue(StructTy), Name);
const std::uint64_t FnHash = IndexedInstrProf::ComputeHash(
getPGOFuncNameVarInitializer(Inc->getName()));
const std::string FnName = std::string{"__profc_"} + std::to_string(FnHash);
GV->addAttribute("sycl-unique-id", FnName);
GV->addAttribute("sycl-device-global-size", Twine(NumCounters * 8).str());
return GV;
}

uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
auto &Ctx = M.getContext();
GlobalVariable *GV;
Expand Down
3 changes: 3 additions & 0 deletions sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -389,13 +389,16 @@ add_custom_target(sycl-compiler
clang-offload-extract
clang-offload-packager
clang-linker-wrapper
compiler-rt
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do other projects enable profiling unconditionally in the build too?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the only remaining question from me, the rest is 🔥 👍

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know of any projects that enable profiling unconditionally. This change enables support for coverage instrumentation in our build but does not enable the instrumentation itself. I think it makes sense to do this unconditionally so that the related functionality in the SYCL runtime can be tested as part of our regular end-to-end testing process.

I'd like to add a build flag for building the SYCL runtime with instrumentation enabled. This flag would allow us to produce coverage reports and identify areas in the runtime that are not covered by tests. I plan to do this in a separate PR and make the instrumentation disabled by default.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it makes sense to do this unconditionally so that the related functionality in the SYCL runtime can be tested as part of our regular end-to-end testing process.

I'd argue that that would be a reason to enable it in configure.py under --ci-defaults, but I'd still think that this unconditional dependency here is too much.

That said, if others are ok, I'm not going to block PR on this.

file-table-tform
llc
llvm-ar
llvm-foreach
llvm-spirv
llvm-link
llvm-objcopy
llvm-profdata
llvm-cov
spirv-to-ir-wrapper
sycl-post-link
opencl-aot
Expand Down
5 changes: 5 additions & 0 deletions sycl/source/detail/context_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ context_impl::~context_impl() {
if (DGEntry != nullptr)
DGEntry->removeAssociatedResources(this);
}
// Free all profile counter USM allocations associated with this context.
for (DeviceGlobalMapEntry *DGEntry :
detail::ProgramManager::getInstance()
.getProfileCounterDeviceGlobalEntries(this))
DGEntry->cleanupProfileCounter(this);
MCachedLibPrograms.clear();
// TODO catch an exception and put it to list of asynchronous exceptions
getAdapter().call_nocheck<UrApiKind::urContextRelease>(MContext);
Expand Down
15 changes: 14 additions & 1 deletion sycl/source/detail/device_global_map.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,10 @@ class DeviceGlobalMap {
// cannot be set until registration happens.
auto EntryUPtr = std::make_unique<DeviceGlobalMapEntry>(
DeviceGlobal->Name, Img, TypeSize, DeviceImageScopeDecorated);
MDeviceGlobals.emplace(DeviceGlobal->Name, std::move(EntryUPtr));
auto NewEntry =
MDeviceGlobals.emplace(DeviceGlobal->Name, std::move(EntryUPtr));
if (NewEntry.first->second->isProfileCounter())
MProfileCounterDeviceGlobals.push_back(NewEntry.first->second.get());
}
}
}
Expand Down Expand Up @@ -114,6 +117,8 @@ class DeviceGlobalMap {
auto EntryUPtr =
std::make_unique<DeviceGlobalMapEntry>(UniqueId, DeviceGlobalPtr);
auto NewEntry = MDeviceGlobals.emplace(UniqueId, std::move(EntryUPtr));
if (NewEntry.first->second->isProfileCounter())
MProfileCounterDeviceGlobals.push_back(NewEntry.first->second.get());
MPtr2DeviceGlobal.insert({DeviceGlobalPtr, NewEntry.first->second.get()});
}

Expand Down Expand Up @@ -154,6 +159,11 @@ class DeviceGlobalMap {
}
}

std::vector<DeviceGlobalMapEntry *> getProfileCounterEntries() {
std::lock_guard<std::mutex> DeviceGlobalsGuard(MDeviceGlobalsMutex);
return MProfileCounterDeviceGlobals;
}

const std::unordered_map<const void *, DeviceGlobalMapEntry *>
getPointerMap() const {
return MPtr2DeviceGlobal;
Expand All @@ -177,6 +187,9 @@ class DeviceGlobalMap {
MDeviceGlobals;
std::unordered_map<const void *, DeviceGlobalMapEntry *> MPtr2DeviceGlobal;

// List of profile counter device globals.
std::vector<DeviceGlobalMapEntry *> MProfileCounterDeviceGlobals;

/// Protects MDeviceGlobals and MPtr2DeviceGlobal.
std::mutex MDeviceGlobalsMutex;
};
Expand Down
62 changes: 60 additions & 2 deletions sycl/source/detail/device_global_map_entry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,62 @@ OwnedUrEvent DeviceGlobalUSMMem::getInitEvent(adapter_impl &Adapter) {
}
}

bool DeviceGlobalMapEntry::isAvailableInContext(const context_impl *CtxImpl) {
std::lock_guard<std::mutex> Lock{MDeviceToUSMPtrMapMutex};
for (const auto &It : MDeviceToUSMPtrMap)
if (It.first.second == CtxImpl)
return true;
return false;
}

bool DeviceGlobalMapEntry::isProfileCounter() {
const std::string CounterPrefix = "__profc_";
return MUniqueId.substr(0, CounterPrefix.size()) == CounterPrefix;
}

extern "C" void __attribute__((weak))
__sycl_increment_profile_counters(std::uint64_t FnHash, std::size_t NumCounters,
const std::uint64_t *Increments);

void DeviceGlobalMapEntry::cleanupProfileCounter(context_impl *CtxImpl) {
std::lock_guard<std::mutex> Lock{MDeviceToUSMPtrMapMutex};
const std::size_t NumCounters = MDeviceGlobalTSize / sizeof(std::uint64_t);
const std::uint64_t FnHash = [&] {
const auto PrefixSize = std::string{"__profc_"}.size();
constexpr int DecimalBase = 10;
return std::strtoull(MUniqueId.substr(PrefixSize).c_str(), nullptr,
DecimalBase);
}();
for (const device_impl &Device : CtxImpl->getDevices()) {
auto USMPtrIt = MDeviceToUSMPtrMap.find({&Device, CtxImpl});
if (USMPtrIt != MDeviceToUSMPtrMap.end()) {
DeviceGlobalUSMMem &USMMem = USMPtrIt->second;

// Get the increments from the USM pointer.
std::vector<std::uint64_t> Increments(NumCounters);
const std::uint64_t *Counters = static_cast<std::uint64_t *>(USMMem.MPtr);
for (std::size_t I = 0; I < NumCounters; ++I)
Increments[I] = Counters[I];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA and L0 adapters support urUSMContextMemcpyExp, which could allow this to also use device allocations. The problem with using shared memory is that we both require that the device support shared memory allocation and that the backend is able to detect shared memory dependencies coming from global variables. Using urUSMContextMemcpyExp does make this a blocking operation however, so I don't know what is better.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I initially used urUSMContextMemcpyExp but then realized it's not implemented for the OpenCL adapter so we couldn't use this feature on CPU. Once the adapter includes an implementation for this UR function, I think it would make sense to do the blocking copy from a device USM allocation back to the host.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, good point! Will you open some trackers for this? I.e. a tracker for switching to device allocations and urUSMContextMemcpyExp and make it blocked by a tracker for adding support for it in the OpenCL adapter.


// Call the weak symbol to update the profile counters.
if (__sycl_increment_profile_counters)
__sycl_increment_profile_counters(FnHash, Increments.size(),
Increments.data());

// Free the USM memory and release the event if it exists.
detail::usm::freeInternal(USMMem.MPtr, CtxImpl);
if (USMMem.MInitEvent != nullptr)
CtxImpl->getAdapter().call<UrApiKind::urEventRelease>(
USMMem.MInitEvent);

// Set to nullptr to avoid double free.
USMMem.MPtr = nullptr;
USMMem.MInitEvent = nullptr;
MDeviceToUSMPtrMap.erase(USMPtrIt);
}
}
}

DeviceGlobalUSMMem &
DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) {
assert(!MIsDeviceImageScopeDecorated &&
Expand All @@ -67,7 +123,8 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) {
return DGUSMPtr->second;

void *NewDGUSMPtr = detail::usm::alignedAllocInternal(
0, MDeviceGlobalTSize, &CtxImpl, &DevImpl, sycl::usm::alloc::device);
0, MDeviceGlobalTSize, &CtxImpl, &DevImpl,
isProfileCounter() ? sycl::usm::alloc::shared : sycl::usm::alloc::device);

auto NewAllocIt = MDeviceToUSMPtrMap.emplace(
std::piecewise_construct, std::forward_as_tuple(&DevImpl, &CtxImpl),
Expand Down Expand Up @@ -125,7 +182,8 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(const context &Context) {
return DGUSMPtr->second;

void *NewDGUSMPtr = detail::usm::alignedAllocInternal(
0, MDeviceGlobalTSize, &CtxImpl, &DevImpl, sycl::usm::alloc::device);
0, MDeviceGlobalTSize, &CtxImpl, &DevImpl,
isProfileCounter() ? sycl::usm::alloc::shared : sycl::usm::alloc::device);

auto NewAllocIt = MDeviceToUSMPtrMap.emplace(
std::piecewise_construct, std::forward_as_tuple(&DevImpl, &CtxImpl),
Expand Down
9 changes: 9 additions & 0 deletions sycl/source/detail/device_global_map_entry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,15 @@ struct DeviceGlobalMapEntry {
MIsDeviceImageScopeDecorated = IsDeviceImageScopeDecorated;
}

// Checks if the device_global is available in the given context.
bool isAvailableInContext(const context_impl *CtxImpl);

// Returns true if the device_global is a profile counter.
bool isProfileCounter();

// Cleans up a profile counter device global.
void cleanupProfileCounter(context_impl *CtxImpl);

// Gets or allocates USM memory for a device_global.
DeviceGlobalUSMMem &getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl);

Expand Down
13 changes: 13 additions & 0 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2466,6 +2466,19 @@ std::vector<DeviceGlobalMapEntry *> ProgramManager::getDeviceGlobalEntries(
return FoundEntries;
}

std::vector<DeviceGlobalMapEntry *>
ProgramManager::getProfileCounterDeviceGlobalEntries(
const context_impl *CtxImpl) {
const std::vector<DeviceGlobalMapEntry *> ProfileCounters =
ProgramManager::getInstance().m_DeviceGlobals.getProfileCounterEntries();
std::vector<DeviceGlobalMapEntry *> FoundEntries;
for (const auto &DGEntry : ProfileCounters) {
if (DGEntry->isAvailableInContext(CtxImpl))
FoundEntries.push_back(DGEntry);
}
return FoundEntries;
}

void ProgramManager::addOrInitHostPipeEntry(const void *HostPipePtr,
const char *UniqueId) {
std::lock_guard<std::mutex> HostPipesGuard(m_HostPipesMutex);
Expand Down
5 changes: 5 additions & 0 deletions sycl/source/detail/program_manager/program_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,11 @@ class ProgramManager {
std::vector<DeviceGlobalMapEntry *>
getDeviceGlobalEntries(const std::vector<std::string> &UniqueIds,
bool ExcludeDeviceImageScopeDecorated = false);

// The function gets all device_global entries that are profile counters.
std::vector<DeviceGlobalMapEntry *>
getProfileCounterDeviceGlobalEntries(const context_impl *CtxImpl);

// The function inserts or initializes a host_pipe entry into the
// host_pipe map.
void addOrInitHostPipeEntry(const void *HostPipePtr, const char *UniqueId);
Expand Down
Loading
Loading