@@ -116,21 +116,29 @@ void copy_numpy_ndarray_into_usm_ndarray(
116116
117117 // check for applicability of special cases:
118118 // (same type && (both C-contiguous || both F-contiguous)
119- bool both_c_contig =
119+ const bool both_c_contig =
120120 ((src_flags & py::array::c_style) && dst.is_c_contiguous ());
121- bool both_f_contig =
121+ const bool both_f_contig =
122122 ((src_flags & py::array::f_style) && dst.is_f_contiguous ());
123+
124+ const bool same_data_types = (src_type_id == dst_type_id);
125+
123126 if (both_c_contig || both_f_contig) {
124- if (src_type_id == dst_type_id ) {
127+ if (same_data_types ) {
125128 int src_elem_size = npy_src.itemsize ();
126129
127130 sycl::event copy_ev =
128131 exec_q.memcpy (static_cast <void *>(dst_data),
129132 static_cast <const void *>(src_data),
130133 src_nelems * src_elem_size, depends);
131134
132- // wait for copy_ev to complete
133- copy_ev.wait ();
135+ {
136+ // wait for copy_ev to complete
137+ // release GIL to allow other threads (host_tasks)
138+ // a chance to acquire GIL
139+ py::gil_scoped_release lock{};
140+ copy_ev.wait ();
141+ }
134142
135143 return ;
136144 }
@@ -202,6 +210,30 @@ void copy_numpy_ndarray_into_usm_ndarray(
202210 simplified_dst_strides.push_back (1 );
203211 }
204212
213+ const bool can_use_memcpy =
214+ (same_data_types && (nd == 1 ) && (src_offset == 0 ) &&
215+ (dst_offset == 0 ) && (simplified_src_strides[0 ] == 1 ) &&
216+ (simplified_dst_strides[0 ] == 1 ));
217+
218+ if (can_use_memcpy) {
219+ int src_elem_size = npy_src.itemsize ();
220+
221+ sycl::event copy_ev = exec_q.memcpy (
222+ static_cast <void *>(dst_data), static_cast <const void *>(src_data),
223+ src_nelems * src_elem_size, depends);
224+
225+ {
226+ // wait for copy_ev to complete
227+ // release GIL to allow other threads (host_tasks)
228+ // a chance to acquire GIL
229+ py::gil_scoped_release lock{};
230+
231+ copy_ev.wait ();
232+ }
233+
234+ return ;
235+ }
236+
205237 // Minimum and maximum element offsets for source np.ndarray
206238 py::ssize_t npy_src_min_nelem_offset (src_offset);
207239 py::ssize_t npy_src_max_nelem_offset (src_offset);
@@ -230,17 +262,22 @@ void copy_numpy_ndarray_into_usm_ndarray(
230262 }
231263 const sycl::event ©_shape_ev = std::get<2 >(ptr_size_event_tuple);
232264
233- // Get implementation function pointer
234- auto copy_and_cast_from_host_blocking_fn =
235- copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
236- [src_type_id];
265+ {
266+ // release GIL for the blocking call
267+ py::gil_scoped_release lock{};
237268
238- copy_and_cast_from_host_blocking_fn (
239- exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
240- npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
241- dst_offset, depends, {copy_shape_ev}) ;
269+ // Get implementation function pointer
270+ auto copy_and_cast_from_host_blocking_fn =
271+ copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
272+ [src_type_id] ;
242273
243- sycl::free (shape_strides, exec_q);
274+ copy_and_cast_from_host_blocking_fn (
275+ exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
276+ npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
277+ dst_offset, depends, {copy_shape_ev});
278+
279+ sycl::free (shape_strides, exec_q);
280+ }
244281
245282 return ;
246283}
0 commit comments