Skip to content

Commit d5eb268

Browse files
author
Hugh Delaney
authored
Merge branch 'main' into tensormap-exp-api
2 parents 9445457 + ab0a706 commit d5eb268

File tree

17 files changed

+350
-74
lines changed

17 files changed

+350
-74
lines changed

.github/codeql/codeql-config.yml

Lines changed: 0 additions & 2 deletions
This file was deleted.

.github/workflows/codeql.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ jobs:
2727
uses: github/codeql-action/init@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7
2828
with:
2929
languages: cpp, python
30-
config-file: ./.github/codeql/codeql-config.yml
3130

3231
- name: Install pip packages
3332
run: pip install -r third_party/requirements.txt

.github/workflows/trivy.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ jobs:
3535
format: 'sarif'
3636
output: 'trivy-results.sarif'
3737
exit-code: 1 # Fail if issue found
38-
skip-dirs: '**/_deps/**'
3938
# file with suppressions: .trivyignore (in root dir)
4039

4140
- name: Print report and trivyignore file

cmake/FetchLevelZero.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR)
4040
set(UR_LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git")
4141
endif()
4242
if (UR_LEVEL_ZERO_LOADER_TAG STREQUAL "")
43-
set(UR_LEVEL_ZERO_LOADER_TAG v1.18.3)
43+
set(UR_LEVEL_ZERO_LOADER_TAG v1.17.39)
4444
endif()
4545

4646
# Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104

source/adapters/opencl/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ add_ur_adapter(${TARGET_NAME} SHARED
3838
${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
3939
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
4040
${CMAKE_CURRENT_SOURCE_DIR}/tensor_map.cpp
41+
${CMAKE_CURRENT_SOURCE_DIR}/usm.hpp
4142
${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp
4243
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
4344
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp

source/adapters/opencl/enqueue.cpp

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
3030
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
3131
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
3232
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
33+
std::vector<size_t> compiledLocalWorksize;
34+
if (!pLocalWorkSize) {
35+
cl_device_id device = nullptr;
36+
CL_RETURN_ON_FAILURE(clGetCommandQueueInfo(
37+
cl_adapter::cast<cl_command_queue>(hQueue), CL_QUEUE_DEVICE,
38+
sizeof(device), &device, nullptr));
39+
// This query always returns size_t[3], if nothing was specified it returns
40+
// all zeroes.
41+
size_t queriedLocalWorkSize[3] = {0, 0, 0};
42+
CL_RETURN_ON_FAILURE(clGetKernelWorkGroupInfo(
43+
cl_adapter::cast<cl_kernel>(hKernel), device,
44+
CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(size_t[3]),
45+
queriedLocalWorkSize, nullptr));
46+
if (queriedLocalWorkSize[0] != 0) {
47+
for (uint32_t i = 0; i < workDim; i++) {
48+
compiledLocalWorksize.push_back(queriedLocalWorkSize[i]);
49+
}
50+
}
51+
}
3352

3453
CL_RETURN_ON_FAILURE(clEnqueueNDRangeKernel(
3554
cl_adapter::cast<cl_command_queue>(hQueue),
3655
cl_adapter::cast<cl_kernel>(hKernel), workDim, pGlobalWorkOffset,
37-
pGlobalWorkSize, pLocalWorkSize, numEventsInWaitList,
38-
cl_adapter::cast<const cl_event *>(phEventWaitList),
56+
pGlobalWorkSize,
57+
compiledLocalWorksize.empty() ? pLocalWorkSize
58+
: compiledLocalWorksize.data(),
59+
numEventsInWaitList, cl_adapter::cast<const cl_event *>(phEventWaitList),
3960
cl_adapter::cast<cl_event *>(phEvent)));
4061

4162
return UR_RESULT_SUCCESS;

source/adapters/opencl/program.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL(
8181

8282
*phProgram = cl_adapter::cast<ur_program_handle_t>(clCreateProgramWithIL(
8383
cl_adapter::cast<cl_context>(hContext), pIL, length, &Err));
84-
CL_RETURN_ON_FAILURE(Err);
8584
} else {
8685

8786
/* If none of the devices conform with CL 2.1 or newer make sure they all
@@ -109,6 +108,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL(
109108

110109
*phProgram = cl_adapter::cast<ur_program_handle_t>(
111110
FuncPtr(cl_adapter::cast<cl_context>(hContext), pIL, length, &Err));
111+
}
112+
113+
// INVALID_VALUE is only returned in three circumstances according to the cl
114+
// spec:
115+
// * pIL == NULL
116+
// * length == 0
117+
// * pIL is not a well-formed binary
118+
// UR has a unique error code for each of these, so here we figure out which
119+
// to return
120+
if (Err == CL_INVALID_VALUE) {
121+
if (pIL == nullptr) {
122+
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
123+
}
124+
if (length == 0) {
125+
return UR_RESULT_ERROR_INVALID_SIZE;
126+
}
127+
return UR_RESULT_ERROR_INVALID_BINARY;
128+
} else {
112129
CL_RETURN_ON_FAILURE(Err);
113130
}
114131

source/adapters/opencl/usm.cpp

Lines changed: 140 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@
1111
#include <ur/ur.hpp>
1212

1313
#include "common.hpp"
14+
#include "usm.hpp"
15+
16+
template <class T>
17+
void AllocDeleterCallback(cl_event event, cl_int, void *pUserData) {
18+
clReleaseEvent(event);
19+
auto Info = static_cast<T *>(pUserData);
20+
delete Info;
21+
}
1422

1523
namespace umf {
1624
ur_result_t getProviderNativeError(const char *, int32_t) {
@@ -312,32 +320,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
312320
numEventsInWaitList, cl_adapter::cast<const cl_event *>(phEventWaitList),
313321
&CopyEvent));
314322

315-
struct DeleteCallbackInfo {
316-
DeleteCallbackInfo(clMemBlockingFreeINTEL_fn USMFree, cl_context CLContext,
317-
void *HostBuffer)
318-
: USMFree(USMFree), CLContext(CLContext), HostBuffer(HostBuffer) {
319-
clRetainContext(CLContext);
320-
}
321-
~DeleteCallbackInfo() {
322-
USMFree(CLContext, HostBuffer);
323-
clReleaseContext(CLContext);
324-
}
325-
DeleteCallbackInfo(const DeleteCallbackInfo &) = delete;
326-
DeleteCallbackInfo &operator=(const DeleteCallbackInfo &) = delete;
327-
328-
clMemBlockingFreeINTEL_fn USMFree;
329-
cl_context CLContext;
330-
void *HostBuffer;
331-
};
332-
333-
auto Info = new DeleteCallbackInfo(USMFree, CLContext, HostBuffer);
323+
if (phEvent) {
324+
// Since we're releasing this in the callback above we need to retain it
325+
// here to keep the user copy alive.
326+
CL_RETURN_ON_FAILURE(clRetainEvent(CopyEvent));
327+
*phEvent = cl_adapter::cast<ur_event_handle_t>(CopyEvent);
328+
}
334329

335-
auto DeleteCallback = [](cl_event, cl_int, void *pUserData) {
336-
auto Info = static_cast<DeleteCallbackInfo *>(pUserData);
337-
delete Info;
338-
};
330+
// This self destructs taking the event and allocation with it.
331+
auto Info = new AllocDeleterCallbackInfo(USMFree, CLContext, HostBuffer);
339332

340-
ClErr = clSetEventCallback(CopyEvent, CL_COMPLETE, DeleteCallback, Info);
333+
ClErr =
334+
clSetEventCallback(CopyEvent, CL_COMPLETE,
335+
AllocDeleterCallback<AllocDeleterCallbackInfo>, Info);
341336
if (ClErr != CL_SUCCESS) {
342337
// We can attempt to recover gracefully by attempting to wait for the copy
343338
// to finish and deleting the info struct here.
@@ -346,11 +341,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
346341
clReleaseEvent(CopyEvent);
347342
CL_RETURN_ON_FAILURE(ClErr);
348343
}
349-
if (phEvent) {
350-
*phEvent = cl_adapter::cast<ur_event_handle_t>(CopyEvent);
351-
} else {
352-
CL_RETURN_ON_FAILURE(clReleaseEvent(CopyEvent));
353-
}
354344

355345
return UR_RESULT_SUCCESS;
356346
}
@@ -369,20 +359,131 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
369359
return mapCLErrorToUR(CLErr);
370360
}
371361

372-
clEnqueueMemcpyINTEL_fn FuncPtr = nullptr;
373-
ur_result_t RetVal = cl_ext::getExtFuncFromContext<clEnqueueMemcpyINTEL_fn>(
362+
clGetMemAllocInfoINTEL_fn GetMemAllocInfo = nullptr;
363+
UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clGetMemAllocInfoINTEL_fn>(
364+
CLContext, cl_ext::ExtFuncPtrCache->clGetMemAllocInfoINTELCache,
365+
cl_ext::GetMemAllocInfoName, &GetMemAllocInfo));
366+
367+
clEnqueueMemcpyINTEL_fn USMMemcpy = nullptr;
368+
UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clEnqueueMemcpyINTEL_fn>(
374369
CLContext, cl_ext::ExtFuncPtrCache->clEnqueueMemcpyINTELCache,
375-
cl_ext::EnqueueMemcpyName, &FuncPtr);
370+
cl_ext::EnqueueMemcpyName, &USMMemcpy));
376371

377-
if (FuncPtr) {
378-
RetVal = mapCLErrorToUR(
379-
FuncPtr(cl_adapter::cast<cl_command_queue>(hQueue), blocking, pDst,
380-
pSrc, size, numEventsInWaitList,
381-
cl_adapter::cast<const cl_event *>(phEventWaitList),
382-
cl_adapter::cast<cl_event *>(phEvent)));
372+
clMemBlockingFreeINTEL_fn USMFree = nullptr;
373+
UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clMemBlockingFreeINTEL_fn>(
374+
CLContext, cl_ext::ExtFuncPtrCache->clMemBlockingFreeINTELCache,
375+
cl_ext::MemBlockingFreeName, &USMFree));
376+
377+
// Check if the two allocations are DEVICE allocations from different
378+
// devices, if they are we need to do the copy indirectly via a host
379+
// allocation.
380+
cl_device_id SrcDevice = 0, DstDevice = 0;
381+
CL_RETURN_ON_FAILURE(
382+
GetMemAllocInfo(CLContext, pSrc, CL_MEM_ALLOC_DEVICE_INTEL,
383+
sizeof(cl_device_id), &SrcDevice, nullptr));
384+
CL_RETURN_ON_FAILURE(
385+
GetMemAllocInfo(CLContext, pDst, CL_MEM_ALLOC_DEVICE_INTEL,
386+
sizeof(cl_device_id), &DstDevice, nullptr));
387+
388+
if ((SrcDevice && DstDevice) && SrcDevice != DstDevice) {
389+
// We need a queue associated with each device, so first figure out which
390+
// one we weren't given.
391+
cl_device_id QueueDevice = nullptr;
392+
CL_RETURN_ON_FAILURE(clGetCommandQueueInfo(
393+
cl_adapter::cast<cl_command_queue>(hQueue), CL_QUEUE_DEVICE,
394+
sizeof(QueueDevice), &QueueDevice, nullptr));
395+
396+
cl_command_queue MissingQueue = nullptr, SrcQueue = nullptr,
397+
DstQueue = nullptr;
398+
if (QueueDevice == SrcDevice) {
399+
MissingQueue = clCreateCommandQueue(CLContext, DstDevice, 0, &CLErr);
400+
SrcQueue = cl_adapter::cast<cl_command_queue>(hQueue);
401+
DstQueue = MissingQueue;
402+
} else {
403+
MissingQueue = clCreateCommandQueue(CLContext, SrcDevice, 0, &CLErr);
404+
DstQueue = cl_adapter::cast<cl_command_queue>(hQueue);
405+
SrcQueue = MissingQueue;
406+
}
407+
CL_RETURN_ON_FAILURE(CLErr);
408+
409+
cl_event HostCopyEvent = nullptr, FinalCopyEvent = nullptr;
410+
clHostMemAllocINTEL_fn HostMemAlloc = nullptr;
411+
UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clHostMemAllocINTEL_fn>(
412+
CLContext, cl_ext::ExtFuncPtrCache->clHostMemAllocINTELCache,
413+
cl_ext::HostMemAllocName, &HostMemAlloc));
414+
415+
auto HostAlloc = HostMemAlloc(CLContext, nullptr, size, 0, &CLErr);
416+
CL_RETURN_ON_FAILURE(CLErr);
417+
418+
// Now that we've successfully allocated we should try to clean it up if we
419+
// hit an error somewhere.
420+
auto checkCLErr = [&](cl_int CLErr) -> ur_result_t {
421+
if (CLErr != CL_SUCCESS) {
422+
if (HostCopyEvent) {
423+
clReleaseEvent(HostCopyEvent);
424+
}
425+
if (FinalCopyEvent) {
426+
clReleaseEvent(FinalCopyEvent);
427+
}
428+
USMFree(CLContext, HostAlloc);
429+
CL_RETURN_ON_FAILURE(CLErr);
430+
}
431+
return UR_RESULT_SUCCESS;
432+
};
433+
434+
UR_RETURN_ON_FAILURE(checkCLErr(USMMemcpy(
435+
SrcQueue, blocking, HostAlloc, pSrc, size, numEventsInWaitList,
436+
cl_adapter::cast<const cl_event *>(phEventWaitList), &HostCopyEvent)));
437+
438+
UR_RETURN_ON_FAILURE(
439+
checkCLErr(USMMemcpy(DstQueue, blocking, pDst, HostAlloc, size, 1,
440+
&HostCopyEvent, &FinalCopyEvent)));
441+
442+
// If this is a blocking operation we can do our cleanup immediately,
443+
// otherwise we need to defer it to an event callback.
444+
if (blocking) {
445+
CL_RETURN_ON_FAILURE(USMFree(CLContext, HostAlloc));
446+
CL_RETURN_ON_FAILURE(clReleaseEvent(HostCopyEvent));
447+
CL_RETURN_ON_FAILURE(clReleaseCommandQueue(MissingQueue));
448+
if (phEvent) {
449+
*phEvent = cl_adapter::cast<ur_event_handle_t>(FinalCopyEvent);
450+
} else {
451+
CL_RETURN_ON_FAILURE(clReleaseEvent(FinalCopyEvent));
452+
}
453+
} else {
454+
if (phEvent) {
455+
*phEvent = cl_adapter::cast<ur_event_handle_t>(FinalCopyEvent);
456+
// We are going to release this event in our callback so we need to
457+
// retain if the user wants a copy.
458+
CL_RETURN_ON_FAILURE(clRetainEvent(FinalCopyEvent));
459+
}
460+
461+
// This self destructs taking the event and allocation with it.
462+
auto DeleterInfo = new AllocDeleterCallbackInfoWithQueue(
463+
USMFree, CLContext, HostAlloc, MissingQueue);
464+
465+
CLErr = clSetEventCallback(
466+
HostCopyEvent, CL_COMPLETE,
467+
AllocDeleterCallback<AllocDeleterCallbackInfoWithQueue>, DeleterInfo);
468+
469+
if (CLErr != CL_SUCCESS) {
470+
// We can attempt to recover gracefully by attempting to wait for the
471+
// copy to finish and deleting the info struct here.
472+
clWaitForEvents(1, &HostCopyEvent);
473+
delete DeleterInfo;
474+
clReleaseEvent(HostCopyEvent);
475+
CL_RETURN_ON_FAILURE(CLErr);
476+
}
477+
}
478+
} else {
479+
CL_RETURN_ON_FAILURE(
480+
USMMemcpy(cl_adapter::cast<cl_command_queue>(hQueue), blocking, pDst,
481+
pSrc, size, numEventsInWaitList,
482+
cl_adapter::cast<const cl_event *>(phEventWaitList),
483+
cl_adapter::cast<cl_event *>(phEvent)));
383484
}
384485

385-
return RetVal;
486+
return UR_RESULT_SUCCESS;
386487
}
387488

388489
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(

source/adapters/opencl/usm.hpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
//===--------------------- usm.hpp - OpenCL Adapter -----------------------===//
2+
//
3+
// Copyright (C) 2024 Intel Corporation
4+
//
5+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
6+
// Exceptions. See LICENSE.TXT
7+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include "CL/cl_ext.h"
12+
#include <CL/cl.h>
13+
14+
// This struct is intended to be used in conjunction with the below callback via
15+
// clSetEventCallback to release temporary allocations created by the adapter to
16+
// implement certain USM operations.
17+
//
18+
// Example usage:
19+
//
20+
// auto Info = new AllocDeleterCallbackInfo(USMFreeFuncPtr, Context,
21+
// Allocation); clSetEventCallback(USMOpEvent, CL_COMPLETE,
22+
// AllocDeleterCallback, Info);
23+
struct AllocDeleterCallbackInfo {
24+
AllocDeleterCallbackInfo(clMemBlockingFreeINTEL_fn USMFree,
25+
cl_context CLContext, void *Allocation)
26+
: USMFree(USMFree), CLContext(CLContext), Allocation(Allocation) {
27+
clRetainContext(CLContext);
28+
}
29+
~AllocDeleterCallbackInfo() {
30+
USMFree(CLContext, Allocation);
31+
clReleaseContext(CLContext);
32+
}
33+
AllocDeleterCallbackInfo(const AllocDeleterCallbackInfo &) = delete;
34+
AllocDeleterCallbackInfo &
35+
operator=(const AllocDeleterCallbackInfo &) = delete;
36+
37+
clMemBlockingFreeINTEL_fn USMFree;
38+
cl_context CLContext;
39+
void *Allocation;
40+
};
41+
42+
struct AllocDeleterCallbackInfoWithQueue : AllocDeleterCallbackInfo {
43+
AllocDeleterCallbackInfoWithQueue(clMemBlockingFreeINTEL_fn USMFree,
44+
cl_context CLContext, void *Allocation,
45+
cl_command_queue CLQueue)
46+
: AllocDeleterCallbackInfo(USMFree, CLContext, Allocation),
47+
CLQueue(CLQueue) {
48+
clRetainContext(CLContext);
49+
}
50+
~AllocDeleterCallbackInfoWithQueue() { clReleaseCommandQueue(CLQueue); }
51+
AllocDeleterCallbackInfoWithQueue(const AllocDeleterCallbackInfoWithQueue &) =
52+
delete;
53+
AllocDeleterCallbackInfoWithQueue &
54+
operator=(const AllocDeleterCallbackInfoWithQueue &) = delete;
55+
56+
cl_command_queue CLQueue;
57+
};
58+
59+
template <class T>
60+
void AllocDeleterCallback(cl_event event, cl_int, void *pUserData);

0 commit comments

Comments
 (0)