Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
288 changes: 97 additions & 191 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -732,9 +732,6 @@ static void
setSpecializationConstants(const std::shared_ptr<device_image_impl> &InputImpl,
ur_program_handle_t Prog,
const AdapterPtr &Adapter) {
// Set ITT annotation specialization constant if needed.
enableITTAnnotationsIfNeeded(Prog, Adapter);

std::lock_guard<std::mutex> Lock{InputImpl->get_spec_const_data_lock()};
const std::map<std::string, std::vector<device_image_impl::SpecConstDescT>>
&SpecConstData = InputImpl->get_spec_const_data_ref();
Expand Down Expand Up @@ -769,15 +766,6 @@ ur_program_handle_t ProgramManager::getBuiltURProgram(
const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl,
const std::string &KernelName, const NDRDescT &NDRDesc,
bool JITCompilationIsRequired) {
KernelProgramCache &Cache = ContextImpl->getKernelProgramCache();

std::string CompileOpts;
std::string LinkOpts;

applyOptionsFromEnvironment(CompileOpts, LinkOpts);

SerializedObj SpecConsts;

// Check if we can optimize program builds for sub-devices by using a program
// built for the root device
DeviceImplPtr RootDevImpl = DeviceImpl;
Expand Down Expand Up @@ -824,19 +812,36 @@ ur_program_handle_t ProgramManager::getBuiltURProgram(
AllImages.push_back(&Img);
std::copy(ImageDeps.begin(), ImageDeps.end(), std::back_inserter(AllImages));

auto BuildF = [this, &Img, &Context, &ContextImpl, &Device, &CompileOpts,
return getBuiltURProgram(Img, Context, {Device}, DeviceImagesToLink,
AllImages);
}

ur_program_handle_t ProgramManager::getBuiltURProgram(
const RTDeviceBinaryImage &Img, const context &Context,
const std::vector<device> &Devs,
const std::set<RTDeviceBinaryImage *> &DeviceImagesToLink,
const std::vector<const RTDeviceBinaryImage *> &AllImages,
const std::shared_ptr<device_image_impl> &DeviceImageImpl,
const SerializedObj &SpecConsts) {
std::string CompileOpts;
std::string LinkOpts;
applyOptionsFromEnvironment(CompileOpts, LinkOpts);
auto BuildF = [this, &Img, &DeviceImageImpl, &Context, &Devs, &CompileOpts,
&LinkOpts, SpecConsts, &DeviceImagesToLink, &AllImages] {
const ContextImplPtr &ContextImpl = getSyclObjImpl(Context);
const AdapterPtr &Adapter = ContextImpl->getAdapter();
applyOptionsFromImage(CompileOpts, LinkOpts, Img, {Device}, Adapter);
applyOptionsFromImage(CompileOpts, LinkOpts, Img, Devs, Adapter);
// Should always come last!
appendCompileEnvironmentVariablesThatAppend(CompileOpts);
appendLinkEnvironmentVariablesThatAppend(LinkOpts);

auto [NativePrg, DeviceCodeWasInCache] = getOrCreateURProgram(
Img, AllImages, Context, {Device}, CompileOpts + LinkOpts, SpecConsts);
Img, {AllImages}, Context, Devs, CompileOpts + LinkOpts, SpecConsts);

if (!DeviceCodeWasInCache) {
if (Img.supportsSpecConstants())
enableITTAnnotationsIfNeeded(NativePrg, Adapter);
if (!DeviceCodeWasInCache && Img.supportsSpecConstants()) {
enableITTAnnotationsIfNeeded(NativePrg, Adapter);
if (DeviceImageImpl)
setSpecializationConstants(DeviceImageImpl, NativePrg, Adapter);
}

UrFuncInfo<UrApiKind::urProgramRelease> programReleaseInfo;
Expand Down Expand Up @@ -864,34 +869,28 @@ ur_program_handle_t ProgramManager::getBuiltURProgram(
for (RTDeviceBinaryImage *BinImg : DeviceImagesToLink) {
if (UseDeviceLibs)
DeviceLibReqMask |= getDeviceLibReqMask(*BinImg);
device_image_plain DevImagePlain =
getDeviceImageFromBinaryImage(BinImg, Context, Device);
const std::shared_ptr<detail::device_image_impl> &DeviceImageImpl =
detail::getSyclObjImpl(DevImagePlain);

SerializedObj ImgSpecConsts =
DeviceImageImpl->get_spec_const_blob_ref();

ur_program_handle_t NativePrg =
createURProgram(*BinImg, Context, {Device});
ur_program_handle_t NativePrg = createURProgram(*BinImg, Context, Devs);

if (BinImg->supportsSpecConstants())
setSpecializationConstants(DeviceImageImpl, NativePrg, Adapter);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: this spec constant part was intentionally removed. Because the newly created device image contained no spec constant values, the only thing it did was enable ITT annotations.

enableITTAnnotationsIfNeeded(NativePrg, Adapter);

ProgramsToLink.push_back(NativePrg);
}
}
std::vector<ur_device_handle_t> Devs = {
getSyclObjImpl(Device).get()->getHandleRef()};
;

std::vector<ur_device_handle_t> URDevices;
for (auto Dev : Devs)
URDevices.push_back(getSyclObjImpl(Dev).get()->getHandleRef());

ProgramPtr BuiltProgram = build(
std::move(ProgramManaged), ContextImpl, CompileOpts, LinkOpts, Devs,
DeviceLibReqMask, ProgramsToLink,
std::move(ProgramManaged), ContextImpl, CompileOpts, LinkOpts,
URDevices, DeviceLibReqMask, ProgramsToLink,
/*CreatedFromBinary*/ Img.getFormat() != SYCL_DEVICE_BINARY_TYPE_SPIRV);

// Those extra programs won't be used anymore, just the final linked result
for (ur_program_handle_t Prg : ProgramsToLink)
Adapter->call<UrApiKind::urProgramRelease>(Prg);

emitBuiltProgramInfo(BuiltProgram.get(), ContextImpl);

{
Expand All @@ -902,57 +901,98 @@ ur_program_handle_t ProgramManager::getBuiltURProgram(
}
}

ContextImpl->addDeviceGlobalInitializer(BuiltProgram.get(), {Device}, &Img);
ContextImpl->addDeviceGlobalInitializer(BuiltProgram.get(), Devs, &Img);

// Save program to persistent cache if it is not there
if (!DeviceCodeWasInCache) {
PersistentDeviceCodeCache::putItemToDisc({Device}, AllImages, SpecConsts,
PersistentDeviceCodeCache::putItemToDisc(Devs, AllImages, SpecConsts,
CompileOpts + LinkOpts,
BuiltProgram.get());
}

return BuiltProgram.release();
};

if (!SYCLConfig<SYCL_CACHE_IN_MEM>::get())
return BuildF();

uint32_t ImgId = Img.getImageID();
const ur_device_handle_t UrDevice = Dev->getHandleRef();
auto CacheKey = std::make_pair(std::make_pair(std::move(SpecConsts), ImgId),
std::set<ur_device_handle_t>{UrDevice});
std::set<ur_device_handle_t> URDevicesSet;
std::transform(Devs.begin(), Devs.end(),
std::inserter(URDevicesSet, URDevicesSet.begin()),
[](const device &Dev) {
return getSyclObjImpl(Dev).get()->getHandleRef();
});
auto CacheKey =
std::make_pair(std::make_pair(SpecConsts, ImgId), URDevicesSet);

const ContextImplPtr &ContextImpl = getSyclObjImpl(Context);
KernelProgramCache &Cache = ContextImpl->getKernelProgramCache();
auto GetCachedBuildF = [&Cache, &CacheKey]() {
return Cache.getOrInsertProgram(CacheKey);
};

if (!SYCLConfig<SYCL_CACHE_IN_MEM>::get())
return BuildF();

auto BuildResult = Cache.getOrBuild<errc::build>(GetCachedBuildF, BuildF);
// getOrBuild is not supposed to return nullptr
assert(BuildResult != nullptr && "Invalid build result");

ur_program_handle_t ResProgram = BuildResult->Val;
auto Adapter = ContextImpl->getAdapter();

// Here we have multiple devices a program is built for, so add the program to
// the cache for all subsets of provided list of devices.
const AdapterPtr &Adapter = ContextImpl->getAdapter();
// If we linked any extra device images, then we need to
// cache them as well.
for (const RTDeviceBinaryImage *BImg : DeviceImagesToLink) {
// CacheKey is captured by reference by GetCachedBuildF, so we can simply
// update it here and re-use that lambda.
CacheKey.first.second = BImg->getImageID();
bool DidInsert = Cache.insertBuiltProgram(CacheKey, ResProgram);
if (DidInsert) {
// For every cached copy of the program, we need to increment its refcount
Adapter->call<UrApiKind::urProgramRetain>(ResProgram);
auto CacheLinkedImages = [&Adapter, &Cache, &CacheKey, &ResProgram,
&DeviceImagesToLink] {
for (const RTDeviceBinaryImage *BImg : DeviceImagesToLink) {
// CacheKey is captured by reference by GetCachedBuildF, so we can simply
// update it here and re-use that lambda.
CacheKey.first.second = BImg->getImageID();
bool DidInsert = Cache.insertBuiltProgram(CacheKey, ResProgram);
if (DidInsert) {
// For every cached copy of the program, we need to increment its
// refcount
Adapter->call<UrApiKind::urProgramRetain>(ResProgram);
}
}
};
CacheLinkedImages();

if (URDevicesSet.size() > 1) {
// emplace all subsets of the current set of devices into the cache.
// Set of all devices is not included in the loop as it was already added
// into the cache.
for (int Mask = 1; Mask < (1 << URDevicesSet.size()) - 1; ++Mask) {
std::set<ur_device_handle_t> Subset;
int Index = 0;
for (auto It = URDevicesSet.begin(); It != URDevicesSet.end();
++It, ++Index) {
if (Mask & (1 << Index)) {
Subset.insert(*It);
}
}
// Change device in the cache key to reduce copying of spec const data.
CacheKey.second = Subset;
bool DidInsert = Cache.insertBuiltProgram(CacheKey, ResProgram);
if (DidInsert) {
// For every cached copy of the program, we need to increment its
// refcount
Adapter->call<UrApiKind::urProgramRetain>(ResProgram);
}
CacheLinkedImages();
// getOrBuild is not supposed to return nullptr
assert(BuildResult != nullptr && "Invalid build result");
}
}

// If caching is enabled, one copy of the program handle will be
// stored in the cache, and one handle is returned to the
// caller. In that case, we need to increase the ref count of the
// program.
ContextImpl->getAdapter()->call<UrApiKind::urProgramRetain>(ResProgram);
Adapter->call<UrApiKind::urProgramRetain>(ResProgram);
return ResProgram;
}

// When caching is enabled, the returned UrProgram and UrKernel will
// already have their ref count incremented.
std::tuple<ur_kernel_handle_t, std::mutex *, const KernelArgMask *,
Expand Down Expand Up @@ -2437,8 +2477,6 @@ ProgramManager::compile(const device_image_plain &DeviceImage,
const AdapterPtr &Adapter =
getSyclObjImpl(InputImpl->get_context())->getAdapter();

// Device is not used when creating program from SPIRV, so passing only one
// device is OK.
ur_program_handle_t Prog = createURProgram(*InputImpl->get_bin_image_ref(),
InputImpl->get_context(), Devs);

Expand Down Expand Up @@ -2603,154 +2641,22 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage,
getSyclObjImpl(DeviceImage);

const context Context = InputImpl->get_context();

const ContextImplPtr ContextImpl = getSyclObjImpl(Context);

KernelProgramCache &Cache = ContextImpl->getKernelProgramCache();

std::string CompileOpts;
std::string LinkOpts;
applyOptionsFromEnvironment(CompileOpts, LinkOpts);

const RTDeviceBinaryImage *ImgPtr = InputImpl->get_bin_image_ref();
const RTDeviceBinaryImage &Img = *ImgPtr;

SerializedObj SpecConsts = InputImpl->get_spec_const_blob_ref();

// TODO: Unify this code with getBuiltPIProgram
auto BuildF = [this, &Context, &Img, &Devs, &CompileOpts, &LinkOpts,
&InputImpl, SpecConsts] {
ContextImplPtr ContextImpl = getSyclObjImpl(Context);
const AdapterPtr &Adapter = ContextImpl->getAdapter();
applyOptionsFromImage(CompileOpts, LinkOpts, Img, Devs, Adapter);
// Should always come last!
appendCompileEnvironmentVariablesThatAppend(CompileOpts);
appendLinkEnvironmentVariablesThatAppend(LinkOpts);

// Device is not used when creating program from SPIRV, so passing only one
// device is OK.
auto [NativePrg, DeviceCodeWasInCache] = getOrCreateURProgram(
Img, {&Img}, Context, Devs, CompileOpts + LinkOpts, SpecConsts);

if (!DeviceCodeWasInCache &&
InputImpl->get_bin_image_ref()->supportsSpecConstants())
setSpecializationConstants(InputImpl, NativePrg, Adapter);

UrFuncInfo<UrApiKind::urProgramRelease> programReleaseInfo;
auto programRelease =
programReleaseInfo.getFuncPtrFromModule(ur::getURLoaderLibrary());
ProgramPtr ProgramManaged(NativePrg, programRelease);

// Link a fallback implementation of device libraries if they are not
// supported by a device compiler.
// Pre-compiled programs are supposed to be already linked.
// If device image is not SPIR-V, DeviceLibReqMask will be 0 which means
// no fallback device library will be linked.
uint32_t DeviceLibReqMask = 0;
if (Img.getFormat() == SYCL_DEVICE_BINARY_TYPE_SPIRV &&
!SYCLConfig<SYCL_DEVICELIB_NO_FALLBACK>::get())
DeviceLibReqMask = getDeviceLibReqMask(Img);

// TODO: Add support for dynamic linking with kernel bundles
std::vector<ur_program_handle_t> ExtraProgramsToLink;
std::vector<ur_device_handle_t> URDevices;
for (auto Dev : Devs) {
URDevices.push_back(getSyclObjImpl(Dev).get()->getHandleRef());
}
ProgramPtr BuiltProgram =
build(std::move(ProgramManaged), ContextImpl, CompileOpts, LinkOpts,
URDevices, DeviceLibReqMask, ExtraProgramsToLink);

emitBuiltProgramInfo(BuiltProgram.get(), ContextImpl);

{
std::lock_guard<std::mutex> Lock(MNativeProgramsMutex);
NativePrograms.insert({BuiltProgram.get(), &Img});
}

ContextImpl->addDeviceGlobalInitializer(BuiltProgram.get(), Devs, &Img);

// Save program to persistent cache if it is not there
if (!DeviceCodeWasInCache)
PersistentDeviceCodeCache::putItemToDisc(
Devs, {&Img}, SpecConsts, CompileOpts + LinkOpts, BuiltProgram.get());

return BuiltProgram.release();
};

if (!SYCLConfig<SYCL_CACHE_IN_MEM>::get()) {
auto ResProgram = BuildF();
DeviceImageImplPtr ExecImpl = std::make_shared<detail::device_image_impl>(
InputImpl->get_bin_image_ref(), Context, Devs, bundle_state::executable,
InputImpl->get_kernel_ids_ptr(), ResProgram,
InputImpl->get_spec_const_data_ref(),
InputImpl->get_spec_const_blob_ref());

return createSyclObjFromImpl<device_image_plain>(ExecImpl);
}

uint32_t ImgId = Img.getImageID();
std::set<ur_device_handle_t> URDevicesSet;
std::transform(Devs.begin(), Devs.end(),
std::inserter(URDevicesSet, URDevicesSet.begin()),
[](const device &Dev) {
return getSyclObjImpl(Dev).get()->getHandleRef();
});
auto CacheKey = std::make_pair(std::make_pair(std::move(SpecConsts), ImgId),
URDevicesSet);

// CacheKey is captured by reference so when we overwrite it later we can
// reuse this function.
auto GetCachedBuildF = [&Cache, &CacheKey]() {
return Cache.getOrInsertProgram(CacheKey);
};

auto BuildResult = Cache.getOrBuild<errc::build>(GetCachedBuildF, BuildF);
// getOrBuild is not supposed to return nullptr
assert(BuildResult != nullptr && "Invalid build result");

ur_program_handle_t ResProgram = BuildResult->Val;

// Here we have multiple devices a program is built for, so add the program to
// the cache for all subsets of provided list of devices.
const AdapterPtr &Adapter = ContextImpl->getAdapter();
auto CacheSubsets = [ResProgram, &Adapter]() {
Adapter->call<UrApiKind::urProgramRetain>(ResProgram);
return ResProgram;
};

if (URDevicesSet.size() > 1) {
// emplace all subsets of the current set of devices into the cache.
// Set of all devices is not included in the loop as it was already added
// into the cache.
for (int Mask = 1; Mask < (1 << URDevicesSet.size()) - 1; ++Mask) {
std::set<ur_device_handle_t> Subset;
int Index = 0;
for (auto It = URDevicesSet.begin(); It != URDevicesSet.end();
++It, ++Index) {
if (Mask & (1 << Index)) {
Subset.insert(*It);
}
}
// Change device in the cache key to reduce copying of spec const data.
CacheKey.second = Subset;
Cache.getOrBuild<errc::build>(GetCachedBuildF, CacheSubsets);
// getOrBuild is not supposed to return nullptr
assert(BuildResult != nullptr && "Invalid build result");
}
}

// device_image_impl shares ownership of the PIProgram with, at least, the
// program cache. The ref counter will be decremented in the destructor of
// device_image_impl.
Adapter->call<UrApiKind::urProgramRetain>(ResProgram);

// TODO: Add support for dynamic linking with kernel bundles
ur_program_handle_t ResProgram =
getBuiltURProgram(Img, Context, Devs, /*DeviceImagesToLink*/ {}, {&Img},
InputImpl, SpecConsts);
DeviceImageImplPtr ExecImpl = std::make_shared<detail::device_image_impl>(
InputImpl->get_bin_image_ref(), Context, Devs, bundle_state::executable,
InputImpl->get_kernel_ids_ptr(), ResProgram,
InputImpl->get_spec_const_data_ref(),
InputImpl->get_spec_const_blob_ref());

return createSyclObjFromImpl<device_image_plain>(ExecImpl);
}

Expand Down
Loading
Loading