|
| 1 | +# Release Notes v1.1 |
| 2 | + |
| 3 | +Level Zero Core API. |
| 4 | + |
| 5 | +April 2021 |
| 6 | + |
| 7 | +## Changes in this release: |
| 8 | + |
| 9 | +### Device allocations larger than 4GB size. |
| 10 | +https://spec.oneapi.com/level-zero/latest/core/api.html?highlight=relaxed#relaxedalloclimits-enums |
| 11 | + |
| 12 | +The L0 driver now allows the allocation of buffers larger than 4GB. To use this feature, the `ze_relaxed_allocation_limits_exp_desc_t` |
| 13 | +structure needs to be passed to `zeMemAllocDevice`, `zeMemAllocShared` or `zeMemAllocHost` as a linked descriptor (through the `pNext` member of the allocation descriptor). |
| 14 | + |
| 15 | +Sample code: |
| 16 | + |
| 17 | +```cpp |
| 18 | +ze_relaxed_allocation_limits_exp_desc_t relaxedDesc = {}; |
| 19 | +relaxedDesc.stype = ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC; |
| 20 | +relaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; |
| 21 | + |
| 22 | +ze_device_mem_alloc_desc_t deviceDesc = {}; |
| 23 | +deviceDesc.pNext = &relaxedDesc; |
| 24 | +zeMemAllocDevice(context, &deviceDesc, size, 0, device, &ptr); |
| 25 | +``` |
| 26 | +
|
| 27 | +In addition to this, kernels need to be compiled with the `-ze-opt-greater-than-4GB-buffer-required` build flag. This flag needs to be |
| 28 | +passed in the `pBuildFlags` field of the `ze_module_desc_t` descriptor when calling `zeModuleCreate`. |
| 29 | +
|
| 30 | +### zeDeviceGetGlobalTimestamps for CPU/GPU synchronized time. |
| 31 | +https://spec.oneapi.com/level-zero/latest/core/api.html?highlight=zedevicegetglobaltimestamps#_CPPv427zeDeviceGetGlobalTimestamps18ze_device_handle_tP8uint64_tP8uint64_t |
| 32 | +
|
| 33 | +Returns synchronized Host and device global timestamps. |
| 34 | +
|
| 35 | +Sample code: |
| 36 | +
|
| 37 | +```cpp |
| 38 | +uint64_t hostTimestamp = 0; |
| 39 | +uint64_t deviceTimestamp = 0; |
| 40 | +zeDeviceGetGlobalTimestamps(device, &hostTimestamp, &deviceTimestamp); |
| 41 | +
|
| 42 | +// hostTimestamp and deviceTimestamp now hold the synchronized |
| 43 | +// Host and device global timestamps, respectively. |
| 44 | +... |
| 45 | +``` |
| 46 | + |
| 47 | +### Global work offset |
| 48 | +https://spec.oneapi.com/level-zero/latest/core/api.html?highlight=globaloffset#_CPPv426zeKernelSetGlobalOffsetExp18ze_kernel_handle_t8uint32_t8uint32_t8uint32_t |
| 49 | + |
| 50 | +Applications can now set a global work offset for kernels. |
| 51 | + |
| 52 | +Sample code: |
| 53 | + |
| 54 | +```cpp |
| 55 | +... |
| 56 | +uint32_t groupSizeX = sizeX; |
| 57 | +uint32_t groupSizeY = 1u; |
| 58 | +uint32_t groupSizeZ = 1u; |
| 59 | +zeKernelSetGroupSize(kernel, groupSizeX, groupSizeY, groupSizeZ); |
| 60 | + |
| 61 | +uint32_t offsetx = offset; |
| 62 | +uint32_t offsety = 0; |
| 63 | +uint32_t offsetz = 0; |
| 64 | +zeKernelSetGlobalOffsetExp(kernel, offsetx, offsety, offsetz); |
| 65 | +... |
| 66 | +``` |
| 67 | +
|
| 68 | +### Atomic floating point properties |
| 69 | +https://spec.oneapi.com/level-zero/latest/core/api.html?highlight=ze_structure_type_float_atomic_ext_properties#_CPPv432ze_float_atomic_ext_properties_t |
| 70 | +
|
| 71 | +Applications can now query the floating-point atomic properties supported by the device in a kernel. |
| 72 | +This is done by passing `ze_float_atomic_ext_properties_t` to `zeDeviceGetModuleProperties` as a linked property structure. |
| 73 | +
|
| 74 | +Sample code: |
| 75 | +
|
| 76 | +```cpp |
| 77 | +ze_device_module_properties_t kernelProperties = {}; |
| 78 | +ze_float_atomic_ext_properties_t extendedProperties = {}; |
| 79 | +extendedProperties.stype = ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES; |
| 80 | +kernelProperties.pNext = &extendedProperties; |
| 81 | +zeDeviceGetModuleProperties(hDevice, &kernelProperties); |
| 82 | +
|
| 83 | +if (extendedProperties.fp16Flags & ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD) { |
| 84 | + // device supports fp16 atomic add and subtract on global memory |
| 85 | +} |
| 86 | +``` |
| 87 | + |
| 88 | +### Context Creation for specific devices |
| 89 | +https://spec.oneapi.com/level-zero/latest/core/api.html?highlight=zecontextcreate#_CPPv417zeContextCreateEx18ze_driver_handle_tPK17ze_context_desc_t8uint32_tP18ze_device_handle_tP19ze_context_handle_t |
| 90 | + |
| 91 | +Added `zeContextCreateEx` to create a context with a set of devices. Resources allocated against that context |
| 92 | +are visible only to the devices for which the context was created. |
| 93 | + |
| 94 | +Sample code: |
| 95 | + |
| 96 | +```cpp |
| 97 | +std::vector<ze_device_handle_t> devices; |
| 98 | +devices.push_back(device0); |
| 99 | +devices.push_back(device1); |
| 100 | +... |
| 101 | +zeContextCreateEx(hDriver, &desc, devices.size(), devices.data(), &phContext); |
| 102 | +``` |
| 103 | +
|
| 104 | +### Change on timer resolution |
| 105 | +https://spec.oneapi.com/level-zero/latest/core/api.html?highlight=timerresolution#_CPPv4N22ze_device_properties_t15timerResolutionE |
| 106 | +
|
| 107 | +Timer resolution returned by device properties has been changed to cycles/second (v1.0 has a resolution of nanoseconds). |
| 108 | +To help libraries with the transition to the new resolution, the `UseCyclesPerSecondTimer` environment variable has been defined. |
| 109 | +When set to 1, the driver will return the resolution defined for v1.1 (cycles/second); otherwise, it will still |
| 110 | +return the resolution for v1.0 (nanoseconds). The use of this environment variable is only temporary while applications |
| 111 | +and libraries complete their transition to v1.1, and it will eventually be removed, leaving the resolution for v1.1 as the default. |
| 112 | +
|
| 113 | +When querying for the timer resolution, applications then need to keep in mind: |
| 114 | +
|
| 115 | +* If `ZE_API_VERSION_1_0` is returned by `zeDriverGetApiVersion`: Timer resolution is in nanoseconds. |
| 116 | +* If `ZE_API_VERSION_1_1` is returned by `zeDriverGetApiVersion` and `UseCyclesPerSecondTimer` is not set: Timer resolution is in nanoseconds, as in v1.0. |
| 117 | +* If `ZE_API_VERSION_1_1` is returned by `zeDriverGetApiVersion` and `UseCyclesPerSecondTimer=1`: Timer resolution is in cycles per second, as in v1.1. |
| 118 | +
|
| 119 | +Note: In Release builds, `NEOReadDebugKeys=1` may be needed to read environment variables. To confirm the L0 driver is |
| 120 | +reading the environment variables, please use `PrintDebugSettings=1`, which will print them at the beginning of the |
| 121 | +application. See below: |
| 122 | +
|
| 123 | +```sh |
| 124 | +$ PrintDebugSettings=1 UseCyclesPerSecondTimer=1 ./zello_world_gpu |
| 125 | +Non-default value of debug variable: PrintDebugSettings = 1 |
| 126 | +Non-default value of debug variable: UseCyclesPerSecondTimer = 1 |
| 127 | +... |
| 128 | +``` |
| 129 | + |
| 130 | +Sample code: |
| 131 | + |
| 132 | +If `UseCyclesPerSecondTimer=1` is set: |
| 133 | + |
| 134 | +```cpp |
| 135 | +ze_api_version_t version; |
| 136 | +zeDriverGetApiVersion(hDriver, &version); |
| 137 | +... |
| 138 | +ze_device_properties_t devProperties = {}; |
| 139 | +zeDeviceGetProperties(device, &devProperties); |
| 140 | + |
| 141 | +if (version == ZE_API_VERSION_1_1) { |
| 142 | + uint64_t timerResolutionInCyclesPerSecond = devProperties.timerResolution; |
| 143 | +} else { |
| 144 | + uint64_t timerResolutionInNanoSeconds = devProperties.timerResolution; |
| 145 | +} |
| 146 | + |
| 147 | +... |
| 148 | +``` |
| 149 | +
|
| 150 | +If `UseCyclesPerSecondTimer` is not set: |
| 151 | +
|
| 152 | +```cpp |
| 153 | +ze_api_version_t version; |
| 154 | +zeDriverGetApiVersion(hDriver, &version); |
| 155 | +... |
| 156 | +ze_device_properties_t devProperties = {}; |
| 157 | +zeDeviceGetProperties(device, &devProperties); |
| 158 | +
|
| 159 | +uint64_t timerResolutionInNanoSeconds = devProperties.timerResolution; |
| 160 | +... |
| 161 | +``` |
0 commit comments