Skip to content

Commit e1189e5

Browse files
SyclEvent deleter no longer waits
Added _host_task_utils.hpp file that provides function to submit host task to decrement reference count for each Python object passed to one gated by passed event references. Host task is submitted to sycl::queue behind the given QRef. SyclQueue.submit changes the mechanism by which it ensures that Python objects (and memory they hold) that the kernel works on are not GC collected before the kernel completes its execution. We used to store reference to arguments in the SyclEvent object returned by SyclQueue.submit. To avoid GC-ing kernel arguments before kernel completes execution, deleter of SyclEvent has to call `SyclEvent.wait`. With this PR `SyclQueue.submit` increments reference counts of arguments after submission, then schedules a host task on the same queue, dependent on the kernel event, which aquires GIL and decrements reference counts. After this change the SyclEven.__dealloc__ no longer has to wait. It also no longer needs to store args attribute. This is now deperecated and will be removed.
1 parent 05a5abd commit e1189e5

File tree

4 files changed

+66
-4
lines changed

4 files changed

+66
-4
lines changed

dpctl/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,8 @@ foreach(_cy_file ${_cython_sources})
161161
build_dpctl_ext(${_trgt} ${_cy_file} "dpctl")
162162
endforeach()
163163

164+
target_include_directories(_sycl_queue PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
165+
164166
add_subdirectory(program)
165167
add_subdirectory(memory)
166168
add_subdirectory(tensor)

dpctl/_host_task_util.hpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#include "Python.h"
2+
#include "syclinterface/dpctl_data_types.h"
3+
#include <CL/sycl.hpp>
4+
5+
int async_dec_ref(DPCTLSyclQueueRef QRef,
6+
PyObject **obj_array,
7+
size_t obj_array_size,
8+
DPCTLSyclEventRef *ERefs,
9+
size_t nERefs)
10+
{
11+
12+
sycl::queue *q = reinterpret_cast<sycl::queue *>(QRef);
13+
14+
std::vector<PyObject *> obj_vec;
15+
obj_vec.reserve(obj_array_size);
16+
for (size_t obj_id = 0; obj_id < obj_array_size; ++obj_id) {
17+
obj_vec.push_back(obj_array[obj_id]);
18+
}
19+
20+
try {
21+
q->submit([&](sycl::handler &cgh) {
22+
for (size_t ev_id = 0; ev_id < nERefs; ++ev_id) {
23+
cgh.depends_on(
24+
*(reinterpret_cast<sycl::event *>(ERefs[ev_id])));
25+
}
26+
cgh.host_task([obj_array_size, obj_vec]() {
27+
{
28+
PyGILState_STATE gstate;
29+
gstate = PyGILState_Ensure();
30+
for (size_t i = 0; i < obj_array_size; ++i) {
31+
Py_DECREF(obj_vec[i]);
32+
}
33+
PyGILState_Release(gstate);
34+
}
35+
});
36+
});
37+
} catch (const std::exception &e) {
38+
return 1;
39+
}
40+
41+
return 0;
42+
}

dpctl/_sycl_event.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ cdef class _SyclEvent:
9898

9999
def __dealloc__(self):
100100
if (self._event_ref):
101-
with nogil: DPCTLEvent_Wait(self._event_ref)
101+
# with nogil: DPCTLEvent_Wait(self._event_ref)
102102
DPCTLEvent_Delete(self._event_ref)
103103
self._event_ref = NULL
104104
self.args = None

dpctl/_sycl_queue.pyx

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,17 @@ import ctypes
6565
from .enum_types import backend_type
6666

6767
from cpython cimport pycapsule
68+
from cpython.ref cimport Py_INCREF, PyObject
6869
from libc.stdlib cimport free, malloc
6970

7071
import collections.abc
7172
import logging
7273

74+
75+
cdef extern from "_host_task_util.hpp":
76+
int async_dec_ref(DPCTLSyclQueueRef, PyObject **, size_t, DPCTLSyclEventRef *, size_t) nogil
77+
78+
7379
__all__ = [
7480
"SyclQueue",
7581
"SyclKernelInvalidRangeError",
@@ -714,12 +720,14 @@ cdef class SyclQueue(_SyclQueue):
714720
cdef _arg_data_type *kargty = NULL
715721
cdef DPCTLSyclEventRef *depEvents = NULL
716722
cdef DPCTLSyclEventRef Eref = NULL
717-
cdef int ret
723+
cdef int ret = 0
718724
cdef size_t gRange[3]
719725
cdef size_t lRange[3]
720726
cdef size_t nGS = len(gS)
721727
cdef size_t nLS = len(lS) if lS is not None else 0
722728
cdef size_t nDE = len(dEvents) if dEvents is not None else 0
729+
cdef PyObject **arg_objects = NULL
730+
cdef ssize_t i = 0
723731

724732
# Allocate the arrays to be sent to DPCTLQueue_Submit
725733
kargs = <void**>malloc(len(args) * sizeof(void*))
@@ -820,8 +828,18 @@ cdef class SyclQueue(_SyclQueue):
820828
raise SyclKernelSubmitError(
821829
"Kernel submission to Sycl queue failed."
822830
)
823-
824-
return SyclEvent._create(Eref, args)
831+
# increment reference counts to each argument
832+
arg_objects = <PyObject **>malloc(len(args) * sizeof(PyObject *))
833+
for i in range(len(args)):
834+
arg_objects[i] = <PyObject *>(args[i])
835+
Py_INCREF(<object> arg_objects[i])
836+
837+
# schedule decrement
838+
async_dec_ref(self.get_queue_ref(), arg_objects, len(args), &Eref, 1)
839+
# free memory
840+
free(arg_objects)
841+
842+
return SyclEvent._create(Eref, [])
825843

826844
cpdef void wait(self):
827845
with nogil: DPCTLQueue_Wait(self._queue_ref)

0 commit comments

Comments
 (0)