@@ -245,6 +245,7 @@ void load_balanced_bounds_presolve_t<i_t, f_t>::setup(
245245 heavy_degree_cutoff,
246246 problem.cnst_bin_offsets ,
247247 problem.offsets );
248+ RAFT_CHECK_CUDA (stream_heavy_cnst);
248249
249250 num_blocks_heavy_vars = create_heavy_item_block_segments (stream_heavy_vars,
250251 heavy_vars_vertex_ids,
@@ -253,49 +254,34 @@ void load_balanced_bounds_presolve_t<i_t, f_t>::setup(
253254 heavy_degree_cutoff,
254255 problem.vars_bin_offsets ,
255256 problem.reverse_offsets );
257+ RAFT_CHECK_CUDA (stream_heavy_vars);
256258
257259 tmp_act.resize (2 * num_blocks_heavy_cnst, stream_heavy_cnst);
258260 tmp_bnd.resize (2 * num_blocks_heavy_vars, stream_heavy_vars);
259261
260- std::tie (is_cnst_sub_warp_single_bin, cnst_sub_warp_count) = sub_warp_meta (
261- streams. get_stream () , warp_cnst_offsets, warp_cnst_id_offsets, pb->cnst_bin_offsets , 4 );
262+ std::tie (is_cnst_sub_warp_single_bin, cnst_sub_warp_count) =
263+ sub_warp_meta (stream , warp_cnst_offsets, warp_cnst_id_offsets, pb->cnst_bin_offsets , 4 );
262264
263- std::tie (is_vars_sub_warp_single_bin, vars_sub_warp_count) = sub_warp_meta (
264- streams. get_stream () , warp_vars_offsets, warp_vars_id_offsets, pb->vars_bin_offsets , 4 );
265+ std::tie (is_vars_sub_warp_single_bin, vars_sub_warp_count) =
266+ sub_warp_meta (stream , warp_vars_offsets, warp_vars_id_offsets, pb->vars_bin_offsets , 4 );
265267
266- stream.synchronize ();
267- streams.sync_all_issued ();
268+ // stream.synchronize();
269+ RAFT_CHECK_CUDA (stream);
270+ streams.sync_test_all_issued ();
268271
269272 if (!calc_slack_erase_inf_cnst_graph_created) {
270- bool erase_inf_cnst = true ;
271- calc_slack_erase_inf_cnst_graph_created = build_graph (
272- streams,
273- handle_ptr,
274- calc_slack_erase_inf_cnst_graph,
275- calc_slack_erase_inf_cnst_exec,
276- [erase_inf_cnst, this ]() { this ->calculate_activity_graph (erase_inf_cnst, true ); },
277- [erase_inf_cnst, this ]() { this ->calculate_activity_graph (erase_inf_cnst); });
273+ create_constraint_slack_graph (true );
274+ calc_slack_erase_inf_cnst_graph_created = true ;
278275 }
279276
280277 if (!calc_slack_graph_created) {
281- bool erase_inf_cnst = false ;
282- calc_slack_graph_created = build_graph (
283- streams,
284- handle_ptr,
285- calc_slack_graph,
286- calc_slack_exec,
287- [erase_inf_cnst, this ]() { this ->calculate_activity_graph (erase_inf_cnst, true ); },
288- [erase_inf_cnst, this ]() { this ->calculate_activity_graph (erase_inf_cnst); });
278+ create_constraint_slack_graph (false );
279+ calc_slack_graph_created = true ;
289280 }
290281
291282 if (!upd_bnd_graph_created) {
292- upd_bnd_graph_created = build_graph (
293- streams,
294- handle_ptr,
295- upd_bnd_graph,
296- upd_bnd_exec,
297- [this ]() { this ->calculate_bounds_update_graph (true ); },
298- [this ]() { this ->calculate_bounds_update_graph (); });
283+ create_bounds_update_graph ();
284+ upd_bnd_graph_created = true ;
299285 }
300286}
301287
@@ -368,6 +354,119 @@ void load_balanced_bounds_presolve_t<i_t, f_t>::calculate_activity_graph(bool er
368354 dry_run);
369355}
370356
template <typename i_t, typename f_t>
void load_balanced_bounds_presolve_t<i_t, f_t>::create_bounds_update_graph()
{
  using f_t2 = typename type_2<f_t>::type;

  // Build the CUDA graph that runs the bounds-update kernels and then copies
  // the device-side bounds_changed flag back into the host mirror
  // h_bounds_changed, so callers of upd_bnd_exec can read the flag without a
  // separate synchronous copy.
  cudaGraph_t upd_graph;
  RAFT_CUDA_TRY(cudaGraphCreate(&upd_graph, 0));

  // D2H memcpy node: bounds_changed (device scalar) -> h_bounds_changed (host).
  // cudaGraphAddMemcpyNode requires the full 3D parameter struct even for a
  // single scalar, hence the 1x1x1 pitched-pointer/extent setup.
  cudaGraphNode_t bounds_changed_node;
  {
    i_t* bounds_changed_ptr = bounds_changed.data();

    cudaMemcpy3DParms memcpy_params = {0};
    memcpy_params.srcArray          = NULL;
    memcpy_params.srcPos            = make_cudaPos(0, 0, 0);
    memcpy_params.srcPtr            = make_cudaPitchedPtr(bounds_changed_ptr, sizeof(i_t), 1, 1);
    memcpy_params.dstArray          = NULL;
    memcpy_params.dstPos            = make_cudaPos(0, 0, 0);
    memcpy_params.dstPtr            = make_cudaPitchedPtr(&h_bounds_changed, sizeof(i_t), 1, 1);
    memcpy_params.extent            = make_cudaExtent(sizeof(i_t), 1, 1);
    memcpy_params.kind              = cudaMemcpyDeviceToHost;
    RAFT_CUDA_TRY(
      cudaGraphAddMemcpyNode(&bounds_changed_node, upd_graph, NULL, 0, &memcpy_params));
  }

  auto bounds_update_view = get_bounds_update_view(*pb);

  // Kernel nodes: heavy (high-degree) variables, per-block variables, and
  // sub-warp variables each use a dedicated launch strategy; all depend on the
  // memcpy node above.
  create_update_bounds_heavy_vars<i_t, f_t, f_t2, 640>(upd_graph,
                                                       bounds_changed_node,
                                                       bounds_update_view,
                                                       make_span_2(tmp_bnd),
                                                       heavy_vars_vertex_ids,
                                                       heavy_vars_pseudo_block_ids,
                                                       heavy_vars_block_segments,
                                                       pb->vars_bin_offsets,
                                                       heavy_degree_cutoff,
                                                       num_blocks_heavy_vars);
  RAFT_CUDA_TRY(cudaGetLastError());
  create_update_bounds_per_block<i_t, f_t, f_t2>(
    upd_graph, bounds_changed_node, bounds_update_view, pb->vars_bin_offsets, heavy_degree_cutoff);
  RAFT_CUDA_TRY(cudaGetLastError());
  create_update_bounds_sub_warp<i_t, f_t, f_t2>(upd_graph,
                                                bounds_changed_node,
                                                bounds_update_view,
                                                is_vars_sub_warp_single_bin,
                                                vars_sub_warp_count,
                                                warp_vars_offsets,
                                                warp_vars_id_offsets,
                                                pb->vars_bin_offsets);
  RAFT_CUDA_TRY(cudaGetLastError());

#ifdef DEBUG_GRAPH_DOT_DIR
  // Opt-in debug dump of the graph topology. Previously this unconditionally
  // wrote to a hard-coded developer home directory on every call.
  RAFT_CUDA_TRY(cudaGraphDebugDotPrint(upd_graph, DEBUG_GRAPH_DOT_DIR "/debug_upd_graph", 0));
#endif

  RAFT_CUDA_TRY(cudaGraphInstantiate(&upd_bnd_exec, upd_graph, NULL, NULL, 0));
  // The executable graph owns an independent copy of the topology; destroy the
  // builder graph so it is not leaked.
  RAFT_CUDA_TRY(cudaGraphDestroy(upd_graph));
}
409+
template <typename i_t, typename f_t>
void load_balanced_bounds_presolve_t<i_t, f_t>::create_constraint_slack_graph(bool erase_inf_cnst)
{
  using f_t2 = typename type_2<f_t>::type;

  // Build the CUDA graph that resets the device-side bounds_changed flag from
  // the host mirror and then runs the constraint-activity (slack) kernels.
  // Instantiates into calc_slack_erase_inf_cnst_exec or calc_slack_exec
  // depending on erase_inf_cnst.
  cudaGraph_t cnst_slack_graph;
  RAFT_CUDA_TRY(cudaGraphCreate(&cnst_slack_graph, 0));

  // H2D memcpy node: h_bounds_changed (host) -> bounds_changed (device scalar).
  // TODO: investigate why a memset node cannot be added manually here; a
  // memcpy node from the host mirror is used as the reset instead.
  cudaGraphNode_t set_bounds_changed_node;
  {
    i_t* bounds_changed_ptr = bounds_changed.data();

    cudaMemcpy3DParms memcpy_params = {0};
    memcpy_params.srcArray          = NULL;
    memcpy_params.srcPos            = make_cudaPos(0, 0, 0);
    memcpy_params.srcPtr            = make_cudaPitchedPtr(&h_bounds_changed, sizeof(i_t), 1, 1);
    memcpy_params.dstArray          = NULL;
    memcpy_params.dstPos            = make_cudaPos(0, 0, 0);
    memcpy_params.dstPtr            = make_cudaPitchedPtr(bounds_changed_ptr, sizeof(i_t), 1, 1);
    memcpy_params.extent            = make_cudaExtent(sizeof(i_t), 1, 1);
    memcpy_params.kind              = cudaMemcpyHostToDevice;
    RAFT_CUDA_TRY(
      cudaGraphAddMemcpyNode(&set_bounds_changed_node, cnst_slack_graph, NULL, 0, &memcpy_params));
  }

  auto activity_view = get_activity_view(*pb);

  // Kernel nodes: heavy (high-degree) constraints, per-block constraints, and
  // sub-warp constraints each use a dedicated launch strategy; all depend on
  // the memcpy node above.
  create_activity_heavy_cnst<i_t, f_t, f_t2, 512>(cnst_slack_graph,
                                                  set_bounds_changed_node,
                                                  activity_view,
                                                  make_span_2(tmp_act),
                                                  heavy_cnst_vertex_ids,
                                                  heavy_cnst_pseudo_block_ids,
                                                  heavy_cnst_block_segments,
                                                  pb->cnst_bin_offsets,
                                                  heavy_degree_cutoff,
                                                  num_blocks_heavy_cnst,
                                                  erase_inf_cnst);
  RAFT_CUDA_TRY(cudaGetLastError());
  create_activity_per_block<i_t, f_t, f_t2>(cnst_slack_graph,
                                            set_bounds_changed_node,
                                            activity_view,
                                            pb->cnst_bin_offsets,
                                            heavy_degree_cutoff,
                                            erase_inf_cnst);
  RAFT_CUDA_TRY(cudaGetLastError());
  create_activity_sub_warp<i_t, f_t, f_t2>(cnst_slack_graph,
                                           set_bounds_changed_node,
                                           activity_view,
                                           is_cnst_sub_warp_single_bin,
                                           cnst_sub_warp_count,
                                           warp_cnst_offsets,
                                           warp_cnst_id_offsets,
                                           pb->cnst_bin_offsets,
                                           erase_inf_cnst);
  RAFT_CUDA_TRY(cudaGetLastError());

#ifdef DEBUG_GRAPH_DOT_DIR
  // Opt-in debug dump of the graph topology. Previously this unconditionally
  // wrote to a hard-coded developer home directory on every call.
  RAFT_CUDA_TRY(
    cudaGraphDebugDotPrint(cnst_slack_graph, DEBUG_GRAPH_DOT_DIR "/debug_cnst_slack_graph", 0));
#endif

  if (erase_inf_cnst) {
    RAFT_CUDA_TRY(cudaGraphInstantiate(&calc_slack_erase_inf_cnst_exec, cnst_slack_graph, NULL, NULL, 0));
  } else {
    RAFT_CUDA_TRY(cudaGraphInstantiate(&calc_slack_exec, cnst_slack_graph, NULL, NULL, 0));
  }
  // The executable graph owns an independent copy of the topology; destroy the
  // builder graph so it is not leaked.
  RAFT_CUDA_TRY(cudaGraphDestroy(cnst_slack_graph));
}
469+
371470template <typename i_t , typename f_t >
372471void load_balanced_bounds_presolve_t <i_t , f_t >::calculate_bounds_update_graph(bool dry_run)
373472{
@@ -401,12 +500,13 @@ template <typename i_t, typename f_t>
401500void load_balanced_bounds_presolve_t <i_t , f_t >::calculate_constraint_slack_iter(
402501 const raft::handle_t * handle_ptr)
403502{
503+ // h_bounds_changed is copied to bounds_changed in calc_slack_exec
504+ h_bounds_changed = 0 ;
404505 {
405506 // writes nans to constraint activities that are infeasible
406507 // -> less expensive checks for update bounds step
407508 raft::common::nvtx::range scope (" act_cuda_task_graph" );
408509 cudaGraphLaunch (calc_slack_erase_inf_cnst_exec, handle_ptr->get_stream ());
409- handle_ptr->sync_stream ();
410510 }
411511 infeas_cnst_slack_set_to_nan = true ;
412512 RAFT_CHECK_CUDA (handle_ptr->get_stream ());
@@ -416,6 +516,8 @@ template <typename i_t, typename f_t>
416516void load_balanced_bounds_presolve_t <i_t , f_t >::calculate_constraint_slack(
417517 const raft::handle_t * handle_ptr)
418518{
519+ // h_bounds_changed is copied to bounds_changed in calc_slack_exec
520+ h_bounds_changed = 0 ;
419521 {
420522 raft::common::nvtx::range scope (" act_cuda_task_graph" );
421523 cudaGraphLaunch (calc_slack_exec, handle_ptr->get_stream ());
@@ -428,13 +530,10 @@ template <typename i_t, typename f_t>
428530bool load_balanced_bounds_presolve_t <i_t , f_t >::update_bounds_from_slack(
429531 const raft::handle_t * handle_ptr)
430532{
431- i_t h_bounds_changed;
432- bounds_changed.set_value_to_zero_async (handle_ptr->get_stream ());
433-
533+ // bounds_changed is copied to h_bounds_changed in upd_bnd_exec
434534 {
435535 raft::common::nvtx::range scope (" upd_cuda_task_graph" );
436536 cudaGraphLaunch (upd_bnd_exec, handle_ptr->get_stream ());
437- h_bounds_changed = bounds_changed.value (handle_ptr->get_stream ());
438537 }
439538 RAFT_CHECK_CUDA (handle_ptr->get_stream ());
440539 constexpr i_t zero = 0 ;
0 commit comments