Skip to content

Commit 1caa932

Browse files
authored
Use realloc for histogram cache and expose the cache limit. (dmlc#9455)
1 parent a57371e commit 1caa932

File tree

10 files changed

+71
-20
lines changed

10 files changed

+71
-20
lines changed

doc/parameter.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,15 @@ Parameters for Tree Booster
226226
- ``one_output_per_tree``: One model for each target.
227227
- ``multi_output_tree``: Use multi-target trees.
228228

229+
* ``max_cached_hist_node``, [default = 65536]
230+
231+
Maximum number of cached nodes for CPU histogram.
232+
233+
.. versionadded:: 2.0.0
234+
235+
- In most cases this parameter should not be set, except when growing deep trees
236+
on CPU.
237+
229238
.. _cat-param:
230239

231240
Parameters for Categorical Feature

python-package/xgboost/testing/params.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
)
4343

4444
hist_cache_strategy = strategies.fixed_dictionaries(
45-
{"internal_max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])}
45+
{"max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])}
4646
)
4747

4848
hist_multi_parameter_strategy = strategies.fixed_dictionaries(

src/common/ref_resource_view.h

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ class RefResourceView {
3535
size_type size_{0};
3636
std::shared_ptr<common::ResourceHandler> mem_{nullptr};
3737

38+
protected:
39+
/**
 * @brief Re-point this view at a different buffer by overwriting all three members.
 *
 * @param ptr  New data pointer, stored in ptr_.
 * @param size New size, stored in size_.
 * @param mem  Resource handle taken by value and moved into mem_.
 */
void Init(value_type* ptr, size_type size, std::shared_ptr<common::ResourceHandler> mem) {
40+
ptr_ = ptr;
41+
size_ = size;
42+
mem_ = std::move(mem);
43+
}
44+
3845
public:
3946
RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem)
4047
: ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
@@ -60,11 +67,11 @@ class RefResourceView {
6067

6168
RefResourceView() = default;
6269
RefResourceView(RefResourceView const& that) = delete;
63-
RefResourceView(RefResourceView&& that) = delete;
6470
RefResourceView& operator=(RefResourceView const& that) = delete;
6571
/**
6672
* @brief We allow move construction and move assignment; move assignment enables lazy initialization.
6773
*/
74+
RefResourceView(RefResourceView&& that) = default;
6875
RefResourceView& operator=(RefResourceView&& that) = default;
6976

7077
[[nodiscard]] size_type size() const { return size_; } // NOLINT
@@ -154,5 +161,33 @@ template <typename T>
154161
auto resource = std::make_shared<common::MallocResource>(n_elements * sizeof(T));
155162
return RefResourceView{resource->DataAs<T>(), n_elements, resource, init};
156163
}
164+
165+
/**
 * @brief A resizable view over a malloc-backed buffer, built on top of RefResourceView<T>.
 *
 * The storage is obtained through MakeFixedVecWithMalloc and is later resized through
 * common::MallocResource::Resize.
 *
 * NOTE(review): the resize presumably goes through realloc (per the change description);
 * confirm in MallocResource. Whether elements added by a grow are initialized is not
 * visible from this class.
 */
template <typename T>
166+
class ReallocVector : public RefResourceView<T> {
167+
// T must be a non-reference, non-const, trivially copyable type.
// NOTE(review): presumably required because a resize may relocate the bytes -- confirm.
static_assert(!std::is_reference_v<T>);
168+
static_assert(!std::is_const_v<T>);
169+
static_assert(std::is_trivially_copyable_v<T>);
170+
171+
// Shorthand for the base class and its member types (declared in the private section).
using Upper = RefResourceView<T>;
172+
using size_type = typename Upper::size_type; // NOLINT
173+
using value_type = typename Upper::value_type; // NOLINT
174+
175+
public:
176+
/**
 * @brief Construct with zero elements; the base view comes from MakeFixedVecWithMalloc(0, T{}).
 */
ReallocVector() : RefResourceView<T>{MakeFixedVecWithMalloc(0, T{})} {}
177+
178+
/**
 * @brief Construct from MakeFixedVecWithMalloc(n, init).
 *
 * @param n    Number of elements requested.
 * @param init Value forwarded to MakeFixedVecWithMalloc as the initializer.
 */
ReallocVector(size_type n, value_type const& init)
179+
: RefResourceView<T>{MakeFixedVecWithMalloc(n, init)} {}
180+
// Neither copyable nor movable.
ReallocVector(ReallocVector const& that) = delete;
181+
ReallocVector(ReallocVector&& that) = delete;
182+
ReallocVector& operator=(ReallocVector const& that) = delete;
183+
ReallocVector& operator=(ReallocVector&& that) = delete;
184+
185+
/**
 * @brief Resize the underlying resource to hold new_size elements and refresh this view.
 *
 * @param new_size Requested number of elements; the byte count passed to the resource is
 *                 new_size * sizeof(T).
 */
void Resize(typename Upper::size_type new_size) {
186+
// The resource must actually be a MallocResource; the CHECK fails on a null cast result.
auto resource = std::dynamic_pointer_cast<common::MallocResource>(this->Resource());
187+
CHECK(resource);
188+
resource->Resize(new_size * sizeof(T));
189+
// Re-read the data pointer from the resource after the resize and store the new size.
this->Init(resource->template DataAs<T>(), new_size, resource);
190+
}
191+
};
157192
} // namespace xgboost::common
158193
#endif // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_

src/tree/hist/hist_cache.h

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@
55
#define XGBOOST_TREE_HIST_HIST_CACHE_H_
66
#include <cstddef> // for size_t
77
#include <map> // for map
8+
#include <memory> // for unique_ptr
89
#include <vector> // for vector
910

10-
#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow
11-
#include "xgboost/base.h" // for bst_node_t, bst_bin_t
12-
#include "xgboost/logging.h" // for CHECK_GT
13-
#include "xgboost/span.h" // for Span
11+
#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow
12+
#include "../../common/ref_resource_view.h" // for ReallocVector
13+
#include "xgboost/base.h" // for bst_node_t, bst_bin_t
14+
#include "xgboost/logging.h" // for CHECK_GT
15+
#include "xgboost/span.h" // for Span
1416

1517
namespace xgboost::tree {
1618
/**
@@ -32,7 +34,8 @@ class BoundedHistCollection {
3234
std::size_t current_size_{0};
3335

3436
// stores the histograms in a contiguous buffer
35-
std::vector<GradientPairPrecise> data_;
37+
using Vec = common::ReallocVector<GradientPairPrecise>;
38+
std::unique_ptr<Vec> data_{new Vec{}}; // nvcc 12.1 trips over std::make_unique
3639

3740
// number of histogram bins across all features
3841
bst_bin_t n_total_bins_{0};
@@ -42,13 +45,14 @@ class BoundedHistCollection {
4245
bool has_exceeded_{false};
4346

4447
public:
48+
BoundedHistCollection() = default;
4549
common::GHistRow operator[](std::size_t idx) {
4650
auto offset = node_map_.at(idx);
47-
return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_);
51+
return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
4852
}
4953
common::ConstGHistRow operator[](std::size_t idx) const {
5054
auto offset = node_map_.at(idx);
51-
return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_);
55+
return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
5256
}
5357
void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
5458
n_total_bins_ = n_total_bins;
@@ -81,8 +85,8 @@ class BoundedHistCollection {
8185
auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
8286
auto alloc_size = n_new_nodes * n_total_bins_;
8387
auto new_size = alloc_size + current_size_;
84-
if (new_size > data_.size()) {
85-
data_.resize(new_size);
88+
if (new_size > data_->size()) {
89+
data_->Resize(new_size);
8690
}
8791
for (auto nidx : nodes_to_build) {
8892
node_map_[nidx] = current_size_;

src/tree/hist/histogram.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ class HistogramBuilder {
6363
bool is_col_split, HistMakerTrainParam const *param) {
6464
n_threads_ = ctx->Threads();
6565
param_ = p;
66-
hist_.Reset(total_bins, param->internal_max_cached_hist_node);
66+
hist_.Reset(total_bins, param->max_cached_hist_node);
6767
buffer_.Init(total_bins);
6868
is_distributed_ = is_distributed;
6969
is_col_split_ = is_col_split;

src/tree/hist/param.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
1313
constexpr static std::size_t DefaultNodes() { return static_cast<std::size_t>(1) << 16; }
1414

1515
bool debug_synchronize{false};
16-
std::size_t internal_max_cached_hist_node{DefaultNodes()};
16+
std::size_t max_cached_hist_node{DefaultNodes()};
1717

1818
void CheckTreesSynchronized(RegTree const* local_tree) const;
1919

@@ -22,7 +22,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
2222
DMLC_DECLARE_FIELD(debug_synchronize)
2323
.set_default(false)
2424
.describe("Check if all distributed tree are identical after tree construction.");
25-
DMLC_DECLARE_FIELD(internal_max_cached_hist_node)
25+
// Upper bound on the number of nodes kept in the CPU histogram cache. The default is
// DefaultNodes() and the lower bound is 1. The field no longer carries the `internal_`
// prefix, so the description must not call it internal-only.
DMLC_DECLARE_FIELD(max_cached_hist_node)
2626
.set_default(DefaultNodes())
2727
.set_lower_bound(1)
2828
.describe("Maximum number of nodes in CPU histogram cache.");

src/tree/updater_gpu_hist.cu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -866,6 +866,9 @@ class GPUGlobalApproxMaker : public TreeUpdater {
866866
// Used in test to count how many configurations are performed
867867
LOG(DEBUG) << "[GPU Approx]: Configure";
868868
hist_maker_param_.UpdateAllowUnknown(args);
869+
if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) {
870+
LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU.";
871+
}
869872
dh::CheckComputeCapability();
870873
initialised_ = false;
871874

tests/cpp/tree/hist/test_evaluate_splits.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
5151
row_set_collection.Init();
5252

5353
HistMakerTrainParam hist_param;
54-
hist.Reset(gmat.cut.Ptrs().back(), hist_param.internal_max_cached_hist_node);
54+
hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node);
5555
hist.AllocateHistograms({0});
5656
common::BuildHist<false>(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column);
5757

@@ -118,7 +118,7 @@ TEST(HistMultiEvaluator, Evaluate) {
118118
linalg::Vector<GradientPairPrecise> root_sum({2}, Context::kCpuId);
119119
for (bst_target_t t{0}; t < n_targets; ++t) {
120120
auto &hist = histogram[t];
121-
hist.Reset(n_bins * n_features, hist_param.internal_max_cached_hist_node);
121+
hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node);
122122
hist.AllocateHistograms({0});
123123
auto node_hist = hist[0];
124124
node_hist[0] = {-0.5, 0.5};
@@ -235,7 +235,7 @@ auto CompareOneHotAndPartition(bool onehot) {
235235
entries.front().nid = 0;
236236
entries.front().depth = 0;
237237

238-
hist.Reset(gmat.cut.TotalBins(), hist_param.internal_max_cached_hist_node);
238+
hist.Reset(gmat.cut.TotalBins(), hist_param.max_cached_hist_node);
239239
hist.AllocateHistograms({0});
240240
auto node_hist = hist[0];
241241

@@ -265,7 +265,7 @@ TEST(HistEvaluator, Categorical) {
265265
TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
266266
BoundedHistCollection hist;
267267
HistMakerTrainParam hist_param;
268-
hist.Reset(cuts_.TotalBins(), hist_param.internal_max_cached_hist_node);
268+
hist.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
269269
hist.AllocateHistograms({0});
270270
auto node_hist = hist[0];
271271
ASSERT_EQ(node_hist.size(), feature_histogram_.size());

tests/cpp/tree/hist/test_histogram.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -516,7 +516,7 @@ class OverflowTest : public ::testing::TestWithParam<std::tuple<bool, bool>> {
516516
Context ctx;
517517
HistMakerTrainParam hist_param;
518518
if (limit) {
519-
hist_param.Init(Args{{"internal_max_cached_hist_node", "1"}});
519+
hist_param.Init(Args{{"max_cached_hist_node", "1"}});
520520
}
521521

522522
std::shared_ptr<DMatrix> Xy =

tests/cpp/tree/test_evaluate_splits.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class TestPartitionBasedSplit : public ::testing::Test {
5959
cuts_.min_vals_.Resize(1);
6060

6161
HistMakerTrainParam hist_param;
62-
hist_.Reset(cuts_.TotalBins(), hist_param.internal_max_cached_hist_node);
62+
hist_.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
6363
hist_.AllocateHistograms({0});
6464
auto node_hist = hist_[0];
6565

0 commit comments

Comments
 (0)