@@ -724,12 +724,16 @@ void exec_graph_impl::findRealDeps(
724
724
}
725
725
}
726
726
727
- ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNodeDirect (
728
- const sycl::context &Ctx, sycl::detail::device_impl &DeviceImpl,
729
- ur_exp_command_buffer_handle_t CommandBuffer, node_impl &Node) {
727
+ std::optional<ur_exp_command_buffer_sync_point_t >
728
+ exec_graph_impl::enqueueNodeDirect (const sycl::context &Ctx,
729
+ sycl::detail::device_impl &DeviceImpl,
730
+ ur_exp_command_buffer_handle_t CommandBuffer,
731
+ node_impl &Node, bool IsInOrderPartition) {
730
732
std::vector<ur_exp_command_buffer_sync_point_t > Deps;
731
- for (node_impl &N : Node.predecessors ()) {
732
- findRealDeps (Deps, N, MPartitionNodes[&Node]);
733
+ if (!IsInOrderPartition) {
734
+ for (node_impl &N : Node.predecessors ()) {
735
+ findRealDeps (Deps, N, MPartitionNodes[&Node]);
736
+ }
733
737
}
734
738
ur_exp_command_buffer_sync_point_t NewSyncPoint;
735
739
ur_exp_command_buffer_command_handle_t NewCommand = 0 ;
@@ -758,7 +762,8 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNodeDirect(
758
762
ur_result_t Res = sycl::detail::enqueueImpCommandBufferKernel (
759
763
Ctx, DeviceImpl, CommandBuffer,
760
764
*static_cast <sycl::detail::CGExecKernel *>((Node.MCommandGroup .get ())),
761
- Deps, &NewSyncPoint, MIsUpdatable ? &NewCommand : nullptr , nullptr );
765
+ Deps, IsInOrderPartition ? nullptr : &NewSyncPoint,
766
+ MIsUpdatable ? &NewCommand : nullptr , nullptr );
762
767
763
768
if (MIsUpdatable) {
764
769
MCommandMap[&Node] = NewCommand;
@@ -775,16 +780,21 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNodeDirect(
775
780
StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_end, nullptr );
776
781
#endif
777
782
778
- return NewSyncPoint;
783
+ // Linear (in-order) graphs do not return a sync point as the dependencies of
784
+ // successor nodes are handled by the UR CommandBuffer via the isInOrder flag
785
+ return IsInOrderPartition
786
+ ? std::nullopt
787
+ : std::optional<ur_exp_command_buffer_sync_point_t >{NewSyncPoint};
779
788
}
780
789
781
- ur_exp_command_buffer_sync_point_t
790
+ std::optional< ur_exp_command_buffer_sync_point_t >
782
791
exec_graph_impl::enqueueNode (ur_exp_command_buffer_handle_t CommandBuffer,
783
- node_impl &Node) {
784
-
792
+ node_impl &Node, bool IsInOrderPartition) {
785
793
std::vector<ur_exp_command_buffer_sync_point_t > Deps;
786
- for (node_impl &N : Node.predecessors ()) {
787
- findRealDeps (Deps, N, MPartitionNodes[&Node]);
794
+ if (!IsInOrderPartition) {
795
+ for (node_impl &N : Node.predecessors ()) {
796
+ findRealDeps (Deps, N, MPartitionNodes[&Node]);
797
+ }
788
798
}
789
799
790
800
sycl::detail::EventImplPtr Event =
@@ -796,7 +806,11 @@ exec_graph_impl::enqueueNode(ur_exp_command_buffer_handle_t CommandBuffer,
796
806
MCommandMap[&Node] = Event->getCommandBufferCommand ();
797
807
}
798
808
799
- return Event->getSyncPoint ();
809
+ // Linear (in-order) graphs do not return a sync point as the dependencies of
810
+ // successor nodes are handled by the UR CommandBuffer via the isInOrder flag
811
+ return IsInOrderPartition ? std::nullopt
812
+ : std::optional<ur_exp_command_buffer_sync_point_t >{
813
+ Event->getSyncPoint ()};
800
814
}
801
815
802
816
void exec_graph_impl::buildRequirements () {
@@ -825,10 +839,12 @@ void exec_graph_impl::buildRequirements() {
825
839
826
840
void exec_graph_impl::createCommandBuffers (
827
841
sycl::device Device, std::shared_ptr<partition> &Partition) {
842
+ const bool IsInOrderCommandBuffer =
843
+ Partition->MIsInOrderGraph && !MEnableProfiling;
828
844
ur_exp_command_buffer_handle_t OutCommandBuffer;
829
- ur_exp_command_buffer_desc_t Desc{
830
- UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr , MIsUpdatable,
831
- Partition-> MIsInOrderGraph && !MEnableProfiling , MEnableProfiling};
845
+ ur_exp_command_buffer_desc_t Desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC,
846
+ nullptr , MIsUpdatable,
847
+ IsInOrderCommandBuffer , MEnableProfiling};
832
848
context_impl &ContextImpl = *sycl::detail::getSyclObjImpl (MContext);
833
849
sycl::detail::adapter_impl &Adapter = ContextImpl.getAdapter ();
834
850
sycl::detail::device_impl &DeviceImpl = *sycl::detail::getSyclObjImpl (Device);
@@ -857,10 +873,20 @@ void exec_graph_impl::createCommandBuffers(
857
873
Node.MCommandGroup .get ())
858
874
->MStreams .size () ==
859
875
0 ) {
860
- MSyncPoints[&Node] =
861
- enqueueNodeDirect (MContext, DeviceImpl, OutCommandBuffer, Node);
876
+ if (auto OptSyncPoint =
877
+ enqueueNodeDirect (MContext, DeviceImpl, OutCommandBuffer, Node,
878
+ IsInOrderCommandBuffer)) {
879
+ assert (!IsInOrderCommandBuffer &&
880
+ " In-order partitions should not create a sync point" );
881
+ MSyncPoints[&Node] = *OptSyncPoint;
882
+ }
862
883
} else {
863
- MSyncPoints[&Node] = enqueueNode (OutCommandBuffer, Node);
884
+ if (auto OptSyncPoint =
885
+ enqueueNode (OutCommandBuffer, Node, IsInOrderCommandBuffer)) {
886
+ assert (!IsInOrderCommandBuffer &&
887
+ " In-order partitions should not create a sync point" );
888
+ MSyncPoints[&Node] = *OptSyncPoint;
889
+ }
864
890
}
865
891
}
866
892
0 commit comments