Skip to content

Commit ce29b77

Browse files
[SYCL] Fixed bug regarding device caching (#2566)
Created a new function, populateDeviceCacheIfNeeded, so that cached devices could be shared across both piDevicesGet and piextDeviceCreateWithNativeHandle. This new function will check/fill and return cached devices. Renamed getOrCreatePlatforms to getPlatformCache, and refactored the function so that it limits level zero driver calls and removes redundant code. Called getOrMakePlatformImpl() from make_platform() so that the PlatformImpl cache is accessible to platforms that are created using a native handle. Called getOrMakeDeviceImpl() from make_device() so that the DeviceImpl cache is accessible to devices that are created using a native handle. Also, added an E2E test for these changes.
1 parent 126338c commit ce29b77

File tree

4 files changed

+327
-118
lines changed

4 files changed

+327
-118
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 164 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -641,8 +641,7 @@ static pi_result copyModule(ze_context_handle_t ZeContext,
641641

642642
static bool setEnvVar(const char *var, const char *value);
643643

644-
static pi_result getOrCreatePlatform(ze_driver_handle_t ZeDriver,
645-
pi_platform *Platform);
644+
static pi_result populateDeviceCacheIfNeeded(pi_platform Platform);
646645

647646
// Forward declarations for mock implementations of Level Zero APIs that
648647
// do not yet work in the driver.
@@ -730,40 +729,22 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
730729
return mapError(ZeResult);
731730
}
732731

733-
// Level Zero does not have concept of Platforms, but Level Zero driver is the
734-
// closest match.
735-
if (Platforms && NumEntries > 0) {
736-
uint32_t ZeDriverCount = 0;
737-
ZE_CALL(zeDriverGet(&ZeDriverCount, nullptr));
738-
if (ZeDriverCount == 0) {
739-
assert(NumPlatforms != 0);
740-
*NumPlatforms = 0;
741-
return PI_SUCCESS;
742-
}
743-
ze_driver_handle_t ZeDriver;
744-
assert(ZeDriverCount == 1);
745-
ZE_CALL(zeDriverGet(&ZeDriverCount, &ZeDriver));
746-
747-
pi_result Res = getOrCreatePlatform(ZeDriver, Platforms);
748-
if (Res != PI_SUCCESS) {
749-
return Res;
750-
}
751-
}
752-
753-
if (NumPlatforms)
754-
*NumPlatforms = 1;
755-
756-
return PI_SUCCESS;
757-
}
758-
759-
// Retrieve a cached Platform that has a matching driver handle or use the
760-
// driver handle to create and initialize a new Platform.
761-
static pi_result getOrCreatePlatform(ze_driver_handle_t ZeDriver,
762-
pi_platform *Platform) {
763-
764-
// We will retrieve the Max CommandList Cache in this lamda function so that
765-
// it only has to be executed once
766-
static pi_uint32 CommandListCacheSizeValue = ([] {
732+
// Cache pi_platforms for reuse in the future
733+
// It solves two problems;
734+
// 1. sycl::platform equality issue; we always return the same pi_platform.
735+
// 2. performance; we can save time by immediately return from cache.
736+
//
737+
// Note: The memory for "PiPlatformsCache" and "PiPlatformsCacheMutex" is
738+
// intentionally leaked because the application may call into the SYCL
739+
// runtime from a global destructor, and such a call could eventually
740+
// access these variables. Therefore, there is no safe time when
741+
// "PiPlatformsCache" and "PiPlatformsCacheMutex" could be deleted.
742+
static auto PiPlatformsCache = new std::vector<pi_platform>;
743+
static auto PiPlatformsCacheMutex = new std::mutex;
744+
static bool PiPlatformCachePopulated = false;
745+
746+
std::lock_guard<std::mutex> Lock(*PiPlatformsCacheMutex);
747+
if (!PiPlatformCachePopulated) {
767748
const char *CommandListCacheSize =
768749
std::getenv("SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE");
769750
pi_uint32 CommandListCacheSizeValue;
@@ -776,62 +757,69 @@ static pi_result getOrCreatePlatform(ze_driver_handle_t ZeDriver,
776757
"default set.\n");
777758
CommandListCacheSizeValue = 20000;
778759
}
779-
return CommandListCacheSizeValue;
780-
})();
781760

782-
try {
783-
// Cache pi_platforms for reuse in the future
784-
// It solves two problems;
785-
// 1. sycl::device equality issue; we always return the same pi_device.
786-
// 2. performance; we can save time by immediately return from cache.
787-
//
788-
// Note: The memory for "PiPlatformsCache" and "PiPlatformsCacheMutex" is
789-
// intentionally leaked because the application may call into the SYCL
790-
// runtime from a global destructor, and such a call could eventually
791-
// access these variables. Therefore, there is no safe time when
792-
// "PiPlatformsCache" and "PiPlatformsCacheMutex" could be deleted.
793-
static auto PiPlatformsCache = new std::vector<pi_platform>;
794-
static auto PiPlatformsCacheMutex = new std::mutex;
795-
796-
std::lock_guard<std::mutex> Lock(*PiPlatformsCacheMutex);
797-
for (const pi_platform &CachedPlatform : *PiPlatformsCache) {
798-
if (CachedPlatform->ZeDriver == ZeDriver) {
799-
Platform[0] = CachedPlatform;
800-
return PI_SUCCESS;
761+
try {
762+
763+
// Level Zero does not have concept of Platforms, but Level Zero driver is
764+
// the closest match.
765+
uint32_t ZeDriverCount = 0;
766+
ZE_CALL(zeDriverGet(&ZeDriverCount, nullptr));
767+
if (ZeDriverCount == 0) {
768+
PiPlatformCachePopulated = true;
769+
} else {
770+
ze_driver_handle_t ZeDriver;
771+
assert(ZeDriverCount == 1);
772+
ZE_CALL(zeDriverGet(&ZeDriverCount, &ZeDriver));
773+
pi_platform Platform = new _pi_platform(ZeDriver);
774+
775+
// Cache driver properties
776+
ze_driver_properties_t ZeDriverProperties;
777+
ZE_CALL(zeDriverGetProperties(ZeDriver, &ZeDriverProperties));
778+
uint32_t ZeDriverVersion = ZeDriverProperties.driverVersion;
779+
// Intel Level-Zero GPU driver stores version as:
780+
// | 31 - 24 | 23 - 16 | 15 - 0 |
781+
// | Major | Minor | Build |
782+
auto VersionMajor =
783+
std::to_string((ZeDriverVersion & 0xFF000000) >> 24);
784+
auto VersionMinor =
785+
std::to_string((ZeDriverVersion & 0x00FF0000) >> 16);
786+
auto VersionBuild = std::to_string(ZeDriverVersion & 0x0000FFFF);
787+
Platform->ZeDriverVersion =
788+
VersionMajor + "." + VersionMinor + "." + VersionBuild;
789+
790+
ze_api_version_t ZeApiVersion;
791+
ZE_CALL(zeDriverGetApiVersion(ZeDriver, &ZeApiVersion));
792+
Platform->ZeDriverApiVersion =
793+
std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + "." +
794+
std::to_string(ZE_MINOR_VERSION(ZeApiVersion));
795+
796+
Platform->ZeMaxCommandListCache = CommandListCacheSizeValue;
797+
// Save a copy in the cache for future uses.
798+
PiPlatformsCache->push_back(Platform);
799+
PiPlatformCachePopulated = true;
801800
}
801+
} catch (const std::bad_alloc &) {
802+
return PI_OUT_OF_HOST_MEMORY;
803+
} catch (...) {
804+
return PI_ERROR_UNKNOWN;
802805
}
806+
}
803807

804-
// TODO: figure out how/when to release this memory
805-
*Platform = new _pi_platform(ZeDriver);
806-
807-
// Cache driver properties
808-
ze_driver_properties_t ZeDriverProperties;
809-
ZE_CALL(zeDriverGetProperties(ZeDriver, &ZeDriverProperties));
810-
uint32_t ZeDriverVersion = ZeDriverProperties.driverVersion;
811-
// Intel Level-Zero GPU driver stores version as:
812-
// | 31 - 24 | 23 - 16 | 15 - 0 |
813-
// | Major | Minor | Build |
814-
auto VersionMajor = std::to_string((ZeDriverVersion & 0xFF000000) >> 24);
815-
auto VersionMinor = std::to_string((ZeDriverVersion & 0x00FF0000) >> 16);
816-
auto VersionBuild = std::to_string(ZeDriverVersion & 0x0000FFFF);
817-
Platform[0]->ZeDriverVersion =
818-
VersionMajor + "." + VersionMinor + "." + VersionBuild;
819-
820-
ze_api_version_t ZeApiVersion;
821-
ZE_CALL(zeDriverGetApiVersion(ZeDriver, &ZeApiVersion));
822-
Platform[0]->ZeDriverApiVersion =
823-
std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + "." +
824-
std::to_string(ZE_MINOR_VERSION(ZeApiVersion));
825-
826-
Platform[0]->ZeMaxCommandListCache = CommandListCacheSizeValue;
827-
// save a copy in the cache for future uses.
828-
PiPlatformsCache->push_back(Platform[0]);
829-
} catch (const std::bad_alloc &) {
830-
return PI_OUT_OF_HOST_MEMORY;
831-
} catch (...) {
832-
return PI_ERROR_UNKNOWN;
808+
if (Platforms && NumEntries > 0) {
809+
uint32_t I = 0;
810+
for (const pi_platform &CachedPlatform : *PiPlatformsCache) {
811+
if (I < NumEntries) {
812+
*Platforms++ = CachedPlatform;
813+
I++;
814+
} else {
815+
break;
816+
}
817+
}
833818
}
834819

820+
if (NumPlatforms)
821+
*NumPlatforms = PiPlatformsCache->size();
822+
835823
return PI_SUCCESS;
836824
}
837825

@@ -900,10 +888,35 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle,
900888
assert(NativeHandle);
901889
assert(Platform);
902890

903-
// Create PI platform from the given Level Zero driver handle or retrieve it
904-
// from the cache.
905891
auto ZeDriver = pi_cast<ze_driver_handle_t>(NativeHandle);
906-
return getOrCreatePlatform(ZeDriver, Platform);
892+
893+
pi_uint32 NumPlatforms = 0;
894+
pi_result Res = piPlatformsGet(0, nullptr, &NumPlatforms);
895+
if (Res != PI_SUCCESS) {
896+
return Res;
897+
}
898+
899+
if (NumPlatforms) {
900+
std::vector<pi_platform> Platforms(NumPlatforms);
901+
Res = piPlatformsGet(NumPlatforms, Platforms.data(), nullptr);
902+
if (Res != PI_SUCCESS) {
903+
return Res;
904+
}
905+
906+
// The SYCL spec requires that the set of platforms must remain fixed for
907+
// the duration of the application's execution. We assume that we found all
908+
// of the Level Zero drivers when we initialized the platform cache, so the
909+
// "NativeHandle" must already be in the cache. If it is not, this must not
910+
// be a valid Level Zero driver.
911+
for (const pi_platform &CachedPlatform : Platforms) {
912+
if (CachedPlatform->ZeDriver == ZeDriver) {
913+
*Platform = CachedPlatform;
914+
return PI_SUCCESS;
915+
}
916+
}
917+
}
918+
919+
return PI_INVALID_VALUE;
907920
}
908921

909922
// Get the cahched PI device created for the L0 device handle.
@@ -912,9 +925,11 @@ pi_device _pi_platform::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) {
912925

913926
std::lock_guard<std::mutex> Lock(this->PiDevicesCacheMutex);
914927
auto it = std::find_if(PiDevicesCache.begin(), PiDevicesCache.end(),
915-
[&](pi_device &D) { return D->ZeDevice == ZeDevice; });
928+
[&](std::unique_ptr<_pi_device> &D) {
929+
return D.get()->ZeDevice == ZeDevice;
930+
});
916931
if (it != PiDevicesCache.end()) {
917-
return *it;
932+
return (*it).get();
918933
}
919934
return nullptr;
920935
}
@@ -924,20 +939,20 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,
924939
pi_uint32 *NumDevices) {
925940

926941
assert(Platform);
927-
ze_driver_handle_t ZeDriver = Platform->ZeDriver;
928942

929943
// Get number of devices supporting Level Zero
930944
uint32_t ZeDeviceCount = 0;
931945
std::lock_guard<std::mutex> Lock(Platform->PiDevicesCacheMutex);
932-
ZeDeviceCount = Platform->PiDevicesCache.size();
933946

947+
pi_result Res = populateDeviceCacheIfNeeded(Platform);
948+
if (Res != PI_SUCCESS) {
949+
return Res;
950+
}
951+
952+
ZeDeviceCount = Platform->PiDevicesCache.size();
934953
const bool AskingForGPU = (DeviceType & PI_DEVICE_TYPE_GPU);
935954
const bool AskingForDefault = (DeviceType == PI_DEVICE_TYPE_DEFAULT);
936955

937-
if (ZeDeviceCount == 0) {
938-
ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, nullptr));
939-
}
940-
941956
if (ZeDeviceCount == 0 || !(AskingForGPU || AskingForDefault)) {
942957
if (NumDevices)
943958
*NumDevices = 0;
@@ -953,34 +968,53 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,
953968
return PI_SUCCESS;
954969
}
955970

956-
// if devices are already captured in cache, return them from the cache.
957-
for (const pi_device CachedDevice : Platform->PiDevicesCache) {
958-
*Devices++ = CachedDevice;
971+
// Return the devices from the cache.
972+
uint32_t I = 0;
973+
for (const std::unique_ptr<_pi_device> &CachedDevice :
974+
Platform->PiDevicesCache) {
975+
if (I < NumEntries) {
976+
*Devices++ = CachedDevice.get();
977+
I++;
978+
} else {
979+
break;
980+
}
959981
}
960-
if (!Platform->PiDevicesCache.empty()) {
982+
983+
return PI_SUCCESS;
984+
}
985+
986+
// Check the device cache and load it if necessary. The PiDevicesCacheMutex must
987+
// be locked before calling this function to prevent any synchronization issues.
988+
static pi_result populateDeviceCacheIfNeeded(pi_platform Platform) {
989+
990+
if (Platform->DeviceCachePopulated) {
961991
return PI_SUCCESS;
962992
}
963993

994+
ze_driver_handle_t ZeDriver = Platform->ZeDriver;
995+
uint32_t ZeDeviceCount = 0;
996+
ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, nullptr));
997+
964998
try {
965999
std::vector<ze_device_handle_t> ZeDevices(ZeDeviceCount);
9661000
ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, ZeDevices.data()));
9671001

9681002
for (uint32_t I = 0; I < ZeDeviceCount; ++I) {
969-
if (I < NumEntries) {
970-
Devices[I] = new _pi_device(ZeDevices[I], Platform);
971-
pi_result Result = Devices[I]->initialize();
972-
if (Result != PI_SUCCESS) {
973-
return Result;
974-
}
975-
// save a copy in the cache for future uses.
976-
Platform->PiDevicesCache.push_back(Devices[I]);
1003+
std::unique_ptr<_pi_device> Device(
1004+
new _pi_device(ZeDevices[I], Platform));
1005+
pi_result Result = Device->initialize();
1006+
if (Result != PI_SUCCESS) {
1007+
return Result;
9771008
}
1009+
// save a copy in the cache for future uses.
1010+
Platform->PiDevicesCache.push_back(std::move(Device));
9781011
}
9791012
} catch (const std::bad_alloc &) {
9801013
return PI_OUT_OF_HOST_MEMORY;
9811014
} catch (...) {
9821015
return PI_ERROR_UNKNOWN;
9831016
}
1017+
Platform->DeviceCachePopulated = true;
9841018
return PI_SUCCESS;
9851019
}
9861020

@@ -1583,11 +1617,28 @@ pi_result piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle,
15831617
assert(Device);
15841618
assert(Platform);
15851619

1586-
// Create PI device from the given Level Zero device handle.
1587-
// TODO: get the device from the devices' cache.
1620+
std::lock_guard<std::mutex> Lock(Platform->PiDevicesCacheMutex);
1621+
pi_result Res = populateDeviceCacheIfNeeded(Platform);
1622+
if (Res != PI_SUCCESS) {
1623+
return Res;
1624+
}
1625+
15881626
auto ZeDevice = pi_cast<ze_device_handle_t>(NativeHandle);
1589-
*Device = new _pi_device(ZeDevice, Platform);
1590-
return (*Device)->initialize();
1627+
1628+
// The SYCL spec requires that the set of devices must remain fixed for the
1629+
// duration of the application's execution. We assume that we found all of the
1630+
// Level Zero devices when we initialized the device cache, so the
1631+
// "NativeHandle" must already be in the cache. If it is not, this must not be
1632+
// a valid Level Zero device.
1633+
for (const std::unique_ptr<_pi_device> &CachedDevice :
1634+
Platform->PiDevicesCache) {
1635+
if (CachedDevice->ZeDevice == ZeDevice) {
1636+
*Device = CachedDevice.get();
1637+
return PI_SUCCESS;
1638+
}
1639+
}
1640+
1641+
return PI_INVALID_VALUE;
15911642
}
15921643

15931644
pi_result piContextCreate(const pi_context_properties *Properties,

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,10 @@ struct _pi_platform {
7878
std::string ZeDriverApiVersion;
7979

8080
// Cache pi_devices for reuse
81-
std::vector<pi_device> PiDevicesCache;
81+
std::vector<std::unique_ptr<_pi_device>> PiDevicesCache;
8282
std::mutex PiDevicesCacheMutex;
8383
pi_device getDeviceFromNativeHandle(ze_device_handle_t);
84+
bool DeviceCachePopulated = false;
8485

8586
// Maximum Number of Command Lists that can be created.
8687
// This Value is initialized to 20000, but can be changed by the user

0 commit comments

Comments
 (0)