@@ -187,11 +187,12 @@ class type_erased_cgfo_ty {
187187
188188public:
189189 template <class T >
190- type_erased_cgfo_ty (T &f)
191- // NOTE: Even if `T ` is a pointer to a function, `&f` is a pointer to a
190+ type_erased_cgfo_ty (T && f)
191+ // NOTE: Even if `f` is a pointer to a function, `&f` is a pointer to a
192192 // pointer to a function and as such can be cast to `void *` (pointer to
193193 // a function cannot be cast).
194- : object(static_cast <const void *>(&f)), invoker_f(&invoker<T>::call) {}
194+ : object(static_cast <const void *>(&f)),
195+ invoker_f (&invoker<std::remove_reference_t <T>>::call) {}
195196 ~type_erased_cgfo_ty () = default ;
196197
197198 type_erased_cgfo_ty (const type_erased_cgfo_ty &) = delete;
@@ -3878,14 +3879,6 @@ class HandlerAccess {
38783879 Handler.parallel_for_impl (Range, Props, Kernel);
38793880 }
38803881
3881- template <typename T, typename > struct dependent {
3882- using type = T;
3883- };
3884- template <typename T>
3885- using dependent_queue_t = typename dependent<queue, T>::type;
3886- template <typename T>
3887- using dependent_handler_t = typename dependent<handler, T>::type;
3888-
38893882 // pre/postProcess are used only for reductions right now, but the
38903883 // abstractions they provide aren't reduction-specific. The main problem they
38913884 // solve is
@@ -3901,71 +3894,16 @@ class HandlerAccess {
39013894 // inside command group function object (lambda above) so we resort to a
39023895 // somewhat hacky way of creating multiple `handler`s and manual finalization
39033896 // of them (instead of the one in `queue::submit`).
3904- //
3905- // Overloads with `queue &q` are provided in case the caller has it created
3906- // already to avoid unnecessary reference count increments associated with
3907- // `handler::getQueue()`.
3908- template <class FunctorTy >
3909- static void preProcess (handler &CGH, dependent_queue_t <FunctorTy> &q,
3910- FunctorTy Func) {
3911- bool EventNeeded = !q.is_in_order ();
3912- handler AuxHandler (getSyclObjImpl (q), EventNeeded);
3913- AuxHandler.copyCodeLoc (CGH);
3914- std::forward<FunctorTy>(Func)(AuxHandler);
3915- auto E = AuxHandler.finalize ();
3916- assert (!CGH.MIsFinalized &&
3917- " Can't do pre-processing if the command has been enqueued already!" );
3918- if (EventNeeded)
3919- CGH.depends_on (E);
3920- }
3897+ __SYCL_EXPORT static void preProcess (handler &CGH, type_erased_cgfo_ty F);
3898+ __SYCL_EXPORT static void postProcess (handler &CGH, type_erased_cgfo_ty F);
3899+
39213900 template <class FunctorTy >
3922- static void preProcess (dependent_handler_t <FunctorTy> &CGH,
3923- FunctorTy &&Func) {
3924- preProcess (CGH, CGH.getQueue (), std::forward<FunctorTy>(Func));
3901+ static void preProcess (handler &CGH, FunctorTy &Func) {
3902+ preProcess (CGH, type_erased_cgfo_ty{Func});
39253903 }
39263904 template <class FunctorTy >
3927- static void postProcess (dependent_handler_t <FunctorTy> &CGH,
3928- FunctorTy &&Func) {
3929- // The "hacky" `handler`s manipulation mentioned above and implemented here
3930- // is far from perfect. A better approach would be
3931- //
3932- // bool OrigNeedsEvent = CGH.needsEvent()
3933- // assert(CGH.not_finalized/enqueued());
3934- // if (!InOrderQueue)
3935- // CGH.setNeedsEvent()
3936- //
3937- // handler PostProcessHandler(Queue, OrigNeedsEvent)
3938- // auto E = CGH.finalize(); // enqueue original or current last
3939- // // post-process
3940- // if (!InOrder)
3941- // PostProcessHandler.depends_on(E)
3942- //
3943- // swap_impls(CGH, PostProcessHandler)
3944- // return; // queue::submit finalizes PostProcessHandler and returns its
3945- // // event if necessary.
3946- //
3947- // Still hackier than "real" `queue::submit` but at least somewhat sane.
3948- // That, however hasn't been tried yet and we have an even hackier approach
3949- // copied from what's been done in an old reductions implementation before
3950- // eventless submission work has started. Not sure how feasible the approach
3951- // above is at this moment.
3952-
3953- // This `finalize` is wrong (at least logically) if
3954- // `assert(!CGH.eventNeeded())`
3955- auto E = CGH.finalize ();
3956- dependent_queue_t <FunctorTy> Queue = CGH.getQueue ();
3957- bool InOrder = Queue.is_in_order ();
3958- // Cannot use `CGH.eventNeeded()` alone as there might be subsequent
3959- // `postProcess` calls and we cannot address them properly similarly to the
3960- // `finalize` issue described above. `swap_impls` suggested above might be
3961- // able to handle this scenario naturally.
3962- handler AuxHandler (getSyclObjImpl (Queue), CGH.eventNeeded () || !InOrder);
3963- if (!InOrder)
3964- AuxHandler.depends_on (E);
3965- AuxHandler.copyCodeLoc (CGH);
3966- std::forward<FunctorTy>(Func)(AuxHandler);
3967- CGH.MLastEvent = AuxHandler.finalize ();
3968- return ;
3905+ static void postProcess (handler &CGH, FunctorTy &Func) {
3906+ postProcess (CGH, type_erased_cgfo_ty{Func});
39693907 }
39703908};
39713909} // namespace detail
0 commit comments