@@ -200,6 +200,7 @@ struct GPUHistMakerDevice {

  // Reset values for each update iteration
  [[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* p_fmat) {
+    this->monitor.Start(__func__);
    auto const& info = p_fmat->Info();
    this->column_sampler_->Init(ctx_, p_fmat->Info().num_col_, info.feature_weights.HostVector(),
                                param.colsample_bynode, param.colsample_bylevel,
@@ -252,7 +253,7 @@ struct GPUHistMakerDevice {
    this->histogram_.Reset(ctx_, this->hist_param_->MaxCachedHistNodes(ctx_->Device()),
                           feature_groups->DeviceAccessor(ctx_->Device()), cuts_->TotalBins(),
                           false);
-
+    this->monitor.Stop(__func__);
    return p_fmat;
  }

@@ -346,6 +347,38 @@ struct GPUHistMakerDevice {
    monitor.Stop(__func__);
  }

+  void ReduceHist(DMatrix* p_fmat, std::vector<GPUExpandEntry> const& candidates,
+                  std::vector<bst_node_t> const& build_nidx,
+                  std::vector<bst_node_t> const& subtraction_nidx) {
+    if (candidates.empty()) {
+      return;
+    }
+    this->monitor.Start(__func__);
+
+    // Reduce all in one go
+    // This gives much better latency in a distributed setting when processing a large batch
+    this->histogram_.AllReduceHist(ctx_, p_fmat->Info(), build_nidx.at(0), build_nidx.size());
+    // Perform subtraction for sibling nodes
+    auto need_build = this->histogram_.SubtractHist(candidates, build_nidx, subtraction_nidx);
+    if (need_build.empty()) {
+      this->monitor.Stop(__func__);
+      return;
+    }
+
+    // Build the nodes that cannot obtain the histogram using subtraction. This is the slow path.
+    std::int32_t k = 0;
+    for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
+      for (auto nidx : need_build) {
+        this->BuildHist(page, k, nidx);
+      }
+      ++k;
+    }
+    for (auto nidx : need_build) {
+      this->histogram_.AllReduceHist(ctx_, p_fmat->Info(), nidx, 1);
+    }
+    this->monitor.Stop(__func__);
+  }
+
  void UpdatePositionColumnSplit(EllpackDeviceAccessor d_matrix,
                                 std::vector<NodeSplitData> const& split_data,
                                 std::vector<bst_node_t> const& nidx,
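
The SubtractHist/AllReduceHist pair above relies on the standard histogram subtraction trick: every row of a parent node falls into exactly one of its two children, so once the parent histogram and one child's histogram are known, the sibling follows bin-wise by subtraction and only the smaller child needs a pass over the data. A minimal standalone sketch of that identity, using plain std::vector bins rather than the XGBoost histogram types (GradBin and SubtractionTrickDemo are made-up names):

// Standalone illustration, not the XGBoost API.
// sibling[i] = parent[i] - built_child[i] for every histogram bin.
#include <cassert>
#include <cstddef>
#include <vector>

struct GradBin {
  double sum_grad{0.0};
  double sum_hess{0.0};
};

std::vector<GradBin> SubtractionTrickDemo(std::vector<GradBin> const& parent,
                                          std::vector<GradBin> const& built_child) {
  assert(parent.size() == built_child.size());
  std::vector<GradBin> sibling(parent.size());
  for (std::size_t i = 0; i < parent.size(); ++i) {
    sibling[i].sum_grad = parent[i].sum_grad - built_child[i].sum_grad;
    sibling[i].sum_hess = parent[i].sum_hess - built_child[i].sum_hess;
  }
  return sibling;
}

int main() {
  std::vector<GradBin> parent{{1.0, 2.0}, {3.0, 4.0}};
  std::vector<GradBin> left{{0.25, 0.5}, {1.0, 1.5}};
  auto right = SubtractionTrickDemo(parent, left);  // right holds {0.75, 1.5} and {2.0, 2.5}
  return right.size() == parent.size() ? 0 : 1;
}
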
@@ -434,56 +467,74 @@ struct GPUHistMakerDevice {
    }
  };

-  void UpdatePosition(DMatrix* p_fmat, std::vector<GPUExpandEntry> const& candidates,
-                      RegTree* p_tree) {
-    if (candidates.empty()) {
+  // Update position and build histogram.
+  void PartitionAndBuildHist(DMatrix* p_fmat, std::vector<GPUExpandEntry> const& expand_set,
+                             std::vector<GPUExpandEntry> const& candidates, RegTree const* p_tree) {
+    if (expand_set.empty()) {
      return;
    }
-
    monitor.Start(__func__);
+    CHECK_LE(candidates.size(), expand_set.size());

-    auto [nidx, left_nidx, right_nidx, split_data] = this->CreatePartitionNodes(p_tree, candidates);
+    // Update all the nodes if working with external memory; this saves us from working
+    // with the finalize position call, which adds an additional iteration and requires
+    // special handling for row index.
+    bool const is_single_block = p_fmat->SingleColBlock();

-    for (size_t i = 0; i < candidates.size(); i++) {
-      auto const& e = candidates[i];
-      RegTree::Node const& split_node = (*p_tree)[e.nid];
-      auto split_type = p_tree->NodeSplitType(e.nid);
-      nidx[i] = e.nid;
-      left_nidx[i] = split_node.LeftChild();
-      right_nidx[i] = split_node.RightChild();
-      split_data[i] = NodeSplitData{split_node, split_type, evaluator_.GetDeviceNodeCats(e.nid)};
+    // Prepare for update partition
+    auto [nidx, left_nidx, right_nidx, split_data] =
+        this->CreatePartitionNodes(p_tree, is_single_block ? candidates : expand_set);

-      CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
-    }
+    // Prepare for build hist
+    std::vector<bst_node_t> build_nidx(candidates.size());
+    std::vector<bst_node_t> subtraction_nidx(candidates.size());
+    auto prefetch_copy =
+        AssignNodes(p_tree, this->quantiser.get(), candidates, build_nidx, subtraction_nidx);

-    CHECK_EQ(p_fmat->NumBatches(), 1);
-    for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
+    this->histogram_.AllocateHistograms(ctx_, build_nidx, subtraction_nidx);
+
+    monitor.Start("Partition-BuildHist");
+
+    std::int32_t k{0};
+    for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(prefetch_copy))) {
      auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_->Device());
+      auto go_left = GoLeftOp{d_matrix};

+      // Partition histogram.
+      monitor.Start("UpdatePositionBatch");
      if (p_fmat->Info().IsColumnSplit()) {
        UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx);
-        monitor.Stop(__func__);
-        return;
+      } else {
+        partitioners_.at(k)->UpdatePositionBatch(
+            nidx, left_nidx, right_nidx, split_data,
+            [=] __device__(cuda_impl::RowIndexT ridx, int /*nidx_in_batch*/,
+                           const NodeSplitData& data) { return go_left(ridx, data); });
      }
-      auto go_left = GoLeftOp{d_matrix};
-      partitioners_.front()->UpdatePositionBatch(
-          nidx, left_nidx, right_nidx, split_data,
-          [=] __device__(cuda_impl::RowIndexT ridx, int /*nidx_in_batch*/,
-                         const NodeSplitData& data) { return go_left(ridx, data); });
+      monitor.Stop("UpdatePositionBatch");
+
+      for (auto nidx : build_nidx) {
+        this->BuildHist(page, k, nidx);
+      }
+
+      ++k;
    }

+    monitor.Stop("Partition-BuildHist");
+
+    this->ReduceHist(p_fmat, candidates, build_nidx, subtraction_nidx);
+
    monitor.Stop(__func__);
  }

  // After tree update is finished, update the position of all training
  // instances to their final leaf. This information is used later to update the
  // prediction cache
-  void FinalisePosition(DMatrix* p_fmat, RegTree const* p_tree, ObjInfo task, bst_idx_t n_samples,
+  void FinalisePosition(DMatrix* p_fmat, RegTree const* p_tree, ObjInfo task,
                        HostDeviceVector<bst_node_t>* p_out_position) {
    if (!p_fmat->SingleColBlock() && task.UpdateTreeLeaf()) {
      LOG(FATAL) << "Current objective function can not be used with external memory.";
    }
-    if (p_fmat->Info().num_row_ != n_samples) {
+    if (static_cast<std::size_t>(p_fmat->NumBatches() + 1) != this->batch_ptr_.size()) {
      // External memory with concatenation. Not supported.
      p_out_position->Resize(0);
      positions_.clear();
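
The new PartitionAndBuildHist fuses the partition update and the histogram build into a single pass over the cached batches, so external-memory data is read once per level and the collective reduction is issued once afterwards in ReduceHist. The sketch below shows only that control flow; Batch, Partitioner and HistBuilder are hypothetical stand-ins, not the EllpackPage, row partitioner, or histogram types used in this file:

// Structural sketch of the one-pass "partition then accumulate per batch" loop.
#include <cstdint>
#include <vector>

template <typename Batch, typename Partitioner, typename HistBuilder>
void PartitionAndBuildOnePass(std::vector<Batch> const& batches,
                              std::vector<Partitioner>* partitioners,
                              HistBuilder* hist,
                              std::vector<std::int32_t> const& build_nidx) {
  std::int32_t k = 0;
  for (auto const& page : batches) {
    // Route the rows of this batch to their new child nodes.
    (*partitioners)[k].UpdatePositionBatch(page);
    // Accumulate this batch's contribution for every node that needs a fresh histogram.
    for (auto nidx : build_nidx) {
      hist->Build(page, k, nidx);
    }
    ++k;
  }
  // One collective reduction for the whole level, instead of one per batch.
  hist->AllReduce(build_nidx);
}
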
@@ -577,60 +628,6 @@ struct GPUHistMakerDevice {
    return true;
  }

-  /**
-   * \brief Build GPU local histograms for the left and right child of some parent node
-   */
-  void BuildHistLeftRight(DMatrix* p_fmat, std::vector<GPUExpandEntry> const& candidates,
-                          const RegTree& tree) {
-    if (candidates.empty()) {
-      return;
-    }
-    this->monitor.Start(__func__);
-    // Some nodes we will manually compute histograms
-    // others we will do by subtraction
-    std::vector<bst_node_t> hist_nidx(candidates.size());
-    std::vector<bst_node_t> subtraction_nidx(candidates.size());
-    auto prefetch_copy =
-        AssignNodes(&tree, this->quantiser.get(), candidates, hist_nidx, subtraction_nidx);
-
-    std::vector<int> all_new = hist_nidx;
-    all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end());
-    // Allocate the histograms
-    // Guaranteed contiguous memory
-    histogram_.AllocateHistograms(ctx_, all_new);
-
-    std::int32_t k = 0;
-    for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(prefetch_copy))) {
-      for (auto nidx : hist_nidx) {
-        this->BuildHist(page, k, nidx);
-      }
-      ++k;
-    }
-
-    // Reduce all in one go
-    // This gives much better latency in a distributed setting
-    // when processing a large batch
-    this->histogram_.AllReduceHist(ctx_, p_fmat->Info(), hist_nidx.at(0), hist_nidx.size());
-
-    for (size_t i = 0; i < subtraction_nidx.size(); i++) {
-      auto build_hist_nidx = hist_nidx.at(i);
-      auto subtraction_trick_nidx = subtraction_nidx.at(i);
-      auto parent_nidx = candidates.at(i).nid;
-
-      if (!this->histogram_.SubtractionTrick(parent_nidx, build_hist_nidx,
-                                             subtraction_trick_nidx)) {
-        // Calculate other histogram manually
-        std::int32_t k = 0;
-        for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
-          this->BuildHist(page, k, subtraction_trick_nidx);
-          ++k;
-        }
-        this->histogram_.AllReduceHist(ctx_, p_fmat->Info(), subtraction_trick_nidx, 1);
-      }
-    }
-    this->monitor.Stop(__func__);
-  }
-
  void ApplySplit(const GPUExpandEntry& candidate, RegTree* p_tree) {
    RegTree& tree = *p_tree;
@@ -681,8 +678,9 @@ struct GPUHistMakerDevice {
  }

  GPUExpandEntry InitRoot(DMatrix* p_fmat, RegTree* p_tree) {
-    constexpr bst_node_t kRootNIdx = 0;
-    dh::XGBCachingDeviceAllocator<char> alloc;
+    this->monitor.Start(__func__);
+
+    constexpr bst_node_t kRootNIdx = RegTree::kRoot;
    auto quantiser = *this->quantiser;
    auto gpair_it = dh::MakeTransformIterator<GradientPairInt64>(
        dh::tbegin(gpair),
@@ -697,6 +695,7 @@ struct GPUHistMakerDevice {

    histogram_.AllocateHistograms(ctx_, {kRootNIdx});
    std::int32_t k = 0;
+    CHECK_EQ(p_fmat->NumBatches(), this->partitioners_.size());
    for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
      this->BuildHist(page, k, kRootNIdx);
      ++k;
@@ -712,25 +711,18 @@ struct GPUHistMakerDevice {

    // Generate first split
    auto root_entry = this->EvaluateRootSplit(p_fmat, root_sum_quantised);
+
+    this->monitor.Stop(__func__);
    return root_entry;
  }

  void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat, ObjInfo const* task,
                  RegTree* p_tree, HostDeviceVector<bst_node_t>* p_out_position) {
-    bool const is_single_block = p_fmat->SingleColBlock();
-    bst_idx_t const n_samples = p_fmat->Info().num_row_;
-
-    auto& tree = *p_tree;
    // Process maximum 32 nodes at a time
    Driver<GPUExpandEntry> driver(param, 32);

-    monitor.Start("Reset");
    p_fmat = this->Reset(gpair_all, p_fmat);
-    monitor.Stop("Reset");
-
-    monitor.Start("InitRoot");
    driver.Push({this->InitRoot(p_fmat, p_tree)});
-    monitor.Stop("InitRoot");

    // The set of leaves that can be expanded asynchronously
    auto expand_set = driver.Pop();
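
UpdateTree drives the expansion with a driver that hands back at most 32 candidate nodes per iteration, so a single pass over the data can grow a whole batch of nodes at once. Below is a self-contained sketch of that push/pop loop with hypothetical Candidate and SimpleDriver types; the real Driver<GPUExpandEntry> also orders candidates according to the grow policy rather than FIFO, but the batched control flow is the same:

// Toy driver loop: pop up to max_batch candidates, expand them, push their children.
#include <cstddef>
#include <queue>
#include <vector>

struct Candidate {
  int nid{0};
  int depth{0};
};

class SimpleDriver {
 public:
  explicit SimpleDriver(std::size_t max_batch) : max_batch_{max_batch} {}
  void Push(Candidate c) { queue_.push(c); }
  // Return at most max_batch_ candidates to expand in one pass over the data.
  std::vector<Candidate> Pop() {
    std::vector<Candidate> batch;
    while (!queue_.empty() && batch.size() < max_batch_) {
      batch.push_back(queue_.front());
      queue_.pop();
    }
    return batch;
  }

 private:
  std::size_t max_batch_;
  std::queue<Candidate> queue_;
};

int main() {
  SimpleDriver driver(32);       // mirrors Driver<GPUExpandEntry> driver(param, 32)
  driver.Push(Candidate{0, 0});  // root node
  for (auto batch = driver.Pop(); !batch.empty(); batch = driver.Pop()) {
    for (auto const& e : batch) {
      if (e.depth < 3) {  // pretend every node up to depth 3 is worth splitting
        driver.Push(Candidate{2 * e.nid + 1, e.depth + 1});
        driver.Push(Candidate{2 * e.nid + 2, e.depth + 1});
      }
    }
  }
  return 0;
}
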
@@ -740,20 +732,17 @@ struct GPUHistMakerDevice {
    }
    // Get the candidates we are allowed to expand further
    // e.g. We do not bother further processing nodes whose children are beyond max depth
-    std::vector<GPUExpandEntry> filtered_expand_set;
-    std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set),
-                 [&](const auto& e) { return driver.IsChildValid(e); });
+    std::vector<GPUExpandEntry> valid_candidates;
+    std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(valid_candidates),
+                 [&](auto const& e) { return driver.IsChildValid(e); });

+    // Allocate child nodes.
    auto new_candidates =
-        pinned.GetSpan<GPUExpandEntry>(filtered_expand_set.size() * 2, GPUExpandEntry{});
-    // Update all the nodes if working with external memory, this saves us from working
-    // with the finalize position call, which adds an additional iteration and requires
-    // special handling for row index.
-    this->UpdatePosition(p_fmat, is_single_block ? filtered_expand_set : expand_set, p_tree);
+        pinned.GetSpan<GPUExpandEntry>(valid_candidates.size() * 2, GPUExpandEntry());

-    this->BuildHistLeftRight(p_fmat, filtered_expand_set, tree);
+    this->PartitionAndBuildHist(p_fmat, expand_set, valid_candidates, p_tree);

-    this->EvaluateSplits(p_fmat, filtered_expand_set, *p_tree, new_candidates);
+    this->EvaluateSplits(p_fmat, valid_candidates, *p_tree, new_candidates);
    dh::DefaultStream().Sync();

    driver.Push(new_candidates.begin(), new_candidates.end());
@@ -764,10 +753,10 @@ struct GPUHistMakerDevice {
    // be splittable before evaluation but invalid after evaluation as we have more
    // restrictions like min loss change after evaluation. Therefore, the check condition
    // is greater than or equal to.
-    if (is_single_block) {
+    if (p_fmat->SingleColBlock()) {
      CHECK_GE(p_tree->NumNodes(), this->partitioners_.front()->GetNumNodes());
    }
-    this->FinalisePosition(p_fmat, p_tree, *task, n_samples, p_out_position);
+    this->FinalisePosition(p_fmat, p_tree, *task, p_out_position);
  }
};
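
The reworked FinalisePosition guard earlier in this diff compares the number of batches against batch_ptr_ instead of comparing row counts. Assuming batch_ptr_ holds the cumulative row offsets of the cached pages (one entry per batch boundary plus the terminating total), the invariant it checks reduces to the small predicate below; the function name is illustrative and not part of GPUHistMakerDevice:

// Assumption for this sketch: batch_ptr_ stores cumulative row offsets,
// e.g. {0, rows_in_batch_0, rows_in_batch_0 + rows_in_batch_1, ..., n_rows}.
#include <cstddef>
#include <cstdint>
#include <vector>

using bst_idx_t = std::uint64_t;

bool BatchCacheMatches(std::size_t n_batches, std::vector<bst_idx_t> const& batch_ptr) {
  // k batches imply k + 1 offsets; anything else means the DMatrix seen here was
  // concatenated or re-sliced relative to the cached pages, which is not supported.
  return n_batches + 1 == batch_ptr.size();
}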