Skip to content

Commit 25966e4

Browse files
authored
[EM] Pass batch parameter into extmem format. (dmlc#10736)
- Allow customization for format reading.
- Customize the number of pre-fetch batches.
1 parent 074cad2 commit 25966e4

15 files changed

+143
-102
lines changed

include/xgboost/data.h

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -239,42 +239,52 @@ struct Entry {
239239
};
240240

241241
/**
242-
* \brief Parameters for constructing histogram index batches.
242+
* @brief Parameters for constructing histogram index batches.
243243
*/
244244
struct BatchParam {
245245
/**
246-
* \brief Maximum number of bins per feature for histograms.
246+
* @brief Maximum number of bins per feature for histograms.
247247
*/
248248
bst_bin_t max_bin{0};
249249
/**
250-
* \brief Hessian, used for sketching with future approx implementation.
250+
* @brief Hessian, used for sketching with future approx implementation.
251251
*/
252252
common::Span<float const> hess;
253253
/**
254-
* \brief Whether should we force DMatrix to regenerate the batch. Only used for
254+
* @brief Whether should we force DMatrix to regenerate the batch. Only used for
255255
* GHistIndex.
256256
*/
257257
bool regen{false};
258258
/**
259-
* \brief Forbid regenerating the gradient index. Used for internal validation.
259+
* @brief Forbid regenerating the gradient index. Used for internal validation.
260260
*/
261261
bool forbid_regen{false};
262262
/**
263-
* \brief Parameter used to generate column matrix for hist.
263+
* @brief Parameter used to generate column matrix for hist.
264264
*/
265265
double sparse_thresh{std::numeric_limits<double>::quiet_NaN()};
266+
/**
267+
* @brief Used for GPU external memory. Whether to copy the data into device.
268+
*
269+
* This affects only the current round of iteration.
270+
*/
271+
bool prefetch_copy{true};
272+
/**
273+
* @brief The number of batches to pre-fetch for external memory.
274+
*/
275+
std::int32_t n_prefetch_batches{3};
266276

267277
/**
268-
* \brief Exact or others that don't need histogram.
278+
* @brief Exact or others that don't need histogram.
269279
*/
270280
BatchParam() = default;
271281
/**
272-
* \brief Used by the hist tree method.
282+
* @brief Used by the hist tree method.
273283
*/
274284
BatchParam(bst_bin_t max_bin, double sparse_thresh)
275285
: max_bin{max_bin}, sparse_thresh{sparse_thresh} {}
276286
/**
277-
* \brief Used by the approx tree method.
287+
* @brief Used by the approx tree method.
278288
*
279289
* Get batch with sketch weighted by hessian. The batch will be regenerated if the
280290
* span is changed, so caller should keep the span for each iteration.
@@ -295,7 +305,7 @@ struct BatchParam {
295305
}
296306
[[nodiscard]] bool Initialized() const { return max_bin != 0; }
297307
/**
298-
* \brief Make a copy of self for DMatrix to describe how its existing index was generated.
308+
* @brief Make a copy of self for DMatrix to describe how its existing index was generated.
299309
*/
300310
[[nodiscard]] BatchParam MakeCache() const {
301311
auto p = *this;

src/data/ellpack_page_raw_format.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ template <typename T>
6060
RET_IF_NOT(fi->Read(&impl->is_dense));
6161
RET_IF_NOT(fi->Read(&impl->row_stride));
6262

63-
if (has_hmm_ats_) {
63+
if (has_hmm_ats_ && !this->param_.prefetch_copy) {
6464
RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
6565
} else {
6666
RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer));
@@ -95,7 +95,7 @@ template <typename T>
9595
CHECK(this->cuts_->cut_values_.DeviceCanRead());
9696
impl->SetCuts(this->cuts_);
9797

98-
fi->Read(page);
98+
fi->Read(page, this->param_.prefetch_copy);
9999
dh::DefaultStream().Sync();
100100

101101
return true;

src/data/ellpack_page_raw_format.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,17 @@ class EllpackHostCacheStream;
2626
class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
2727
std::shared_ptr<common::HistogramCuts const> cuts_;
2828
DeviceOrd device_;
29+
BatchParam param_;
2930
// Supports CUDA HMM or ATS
3031
bool has_hmm_ats_{false};
3132

3233
public:
3334
explicit EllpackPageRawFormat(std::shared_ptr<common::HistogramCuts const> cuts, DeviceOrd device,
34-
bool has_hmm_ats)
35-
: cuts_{std::move(cuts)}, device_{device}, has_hmm_ats_{has_hmm_ats} {}
35+
BatchParam param, bool has_hmm_ats)
36+
: cuts_{std::move(cuts)},
37+
device_{device},
38+
param_{std::move(param)},
39+
has_hmm_ats_{has_hmm_ats} {}
3640
[[nodiscard]] bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override;
3741
[[nodiscard]] std::size_t Write(const EllpackPage& page,
3842
common::AlignedFileWriteStream* fo) override;

src/data/ellpack_page_source.cu

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,13 @@
1111

1212
#include "../common/common.h" // for safe_cuda
1313
#include "../common/ref_resource_view.cuh"
14-
#include "../common/cuda_pinned_allocator.h" // for pinned_allocator
1514
#include "../common/device_helpers.cuh" // for CUDAStreamView, DefaultStream
1615
#include "../common/resource.cuh" // for PrivateCudaMmapConstStream
1716
#include "ellpack_page.cuh" // for EllpackPageImpl
1817
#include "ellpack_page.h" // for EllpackPage
1918
#include "ellpack_page_source.h"
2019
#include "proxy_dmatrix.cuh" // for Dispatch
2120
#include "xgboost/base.h" // for bst_idx_t
22-
#include "../common/cuda_rt_utils.h" // for NvtxScopedRange
2321
#include "../common/transform_iterator.h" // for MakeIndexTransformIter
2422

2523
namespace xgboost::data {
@@ -91,14 +89,20 @@ class EllpackHostCacheStreamImpl {
9189
ptr_ += 1;
9290
}
9391

94-
void Read(EllpackPage* out) const {
92+
void Read(EllpackPage* out, bool prefetch_copy) const {
9593
auto page = this->cache_->Get(ptr_);
9694

9795
auto impl = out->Impl();
98-
impl->gidx_buffer =
99-
common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(page->gidx_buffer.size());
100-
dh::safe_cuda(cudaMemcpyAsync(impl->gidx_buffer.data(), page->gidx_buffer.data(),
101-
page->gidx_buffer.size_bytes(), cudaMemcpyDefault));
96+
if (prefetch_copy) {
97+
impl->gidx_buffer =
98+
common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(page->gidx_buffer.size());
99+
dh::safe_cuda(cudaMemcpyAsync(impl->gidx_buffer.data(), page->gidx_buffer.data(),
100+
page->gidx_buffer.size_bytes(), cudaMemcpyDefault));
101+
} else {
102+
auto res = page->gidx_buffer.Resource();
103+
impl->gidx_buffer = common::RefResourceView<common::CompressedByteT>{
104+
res->DataAs<common::CompressedByteT>(), page->gidx_buffer.size(), res};
105+
}
102106

103107
impl->n_rows = page->Size();
104108
impl->is_dense = page->IsDense();
@@ -120,7 +124,9 @@ std::shared_ptr<EllpackHostCache> EllpackHostCacheStream::Share() { return p_imp
120124

121125
void EllpackHostCacheStream::Seek(bst_idx_t offset_bytes) { this->p_impl_->Seek(offset_bytes); }
122126

123-
void EllpackHostCacheStream::Read(EllpackPage* page) const { this->p_impl_->Read(page); }
127+
void EllpackHostCacheStream::Read(EllpackPage* page, bool prefetch_copy) const {
128+
this->p_impl_->Read(page, prefetch_copy);
129+
}
124130

125131
void EllpackHostCacheStream::Write(EllpackPage const& page) { this->p_impl_->Write(page); }
126132

@@ -162,8 +168,9 @@ EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateWriter(StringV
162168

163169
template std::unique_ptr<
164170
typename EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::ReaderT>
165-
EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(
166-
StringView name, std::uint64_t offset, std::uint64_t length) const;
171+
EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(StringView name,
172+
bst_idx_t offset,
173+
bst_idx_t length) const;
167174

168175
/**
169176
* EllpackMmapStreamPolicy
@@ -233,6 +240,7 @@ void ExtEllpackPageSourceImpl<F>::Fetch() {
233240
++(*this->source_);
234241
CHECK_GE(this->source_->Iter(), 1);
235242
cuda_impl::Dispatch(proxy_, [this](auto const& value) {
243+
CHECK(this->proxy_->Ctx()->IsCUDA()) << "All batches must use the same device type.";
236244
proxy_->Info().feature_types.SetDevice(dh::GetDevice(this->ctx_));
237245
auto d_feature_types = proxy_->Info().feature_types.ConstDeviceSpan();
238246
auto n_samples = value.NumRows();

src/data/ellpack_page_source.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class EllpackHostCacheStream {
5353

5454
void Seek(bst_idx_t offset_bytes);
5555

56-
void Read(EllpackPage* page) const;
56+
void Read(EllpackPage* page, bool prefetch_copy) const;
5757
void Write(EllpackPage const& page);
5858
};
5959

@@ -71,9 +71,9 @@ class EllpackFormatPolicy {
7171
// For testing with the HMM flag.
7272
explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {}
7373

74-
[[nodiscard]] auto CreatePageFormat() const {
74+
[[nodiscard]] auto CreatePageFormat(BatchParam const& param) const {
7575
CHECK_EQ(cuts_->cut_values_.Device(), device_);
76-
std::unique_ptr<FormatT> fmt{new EllpackPageRawFormat{cuts_, device_, has_hmm_}};
76+
std::unique_ptr<FormatT> fmt{new EllpackPageRawFormat{cuts_, device_, param, has_hmm_}};
7777
return fmt;
7878
}
7979

src/data/extmem_quantile_dmatrix.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ void ExtMemQuantileDMatrix::InitFromCPU(
6666
Context const *ctx,
6767
std::shared_ptr<DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>> iter,
6868
DMatrixHandle proxy_handle, BatchParam const &p, float missing, std::shared_ptr<DMatrix> ref) {
69+
xgboost_NVTX_FN_RANGE();
70+
6971
auto proxy = MakeProxy(proxy_handle);
7072
CHECK(proxy);
7173

@@ -118,7 +120,7 @@ BatchSet<GHistIndexMatrix> ExtMemQuantileDMatrix::GetGradientIndex(Context const
118120
}
119121

120122
CHECK(this->ghist_index_source_);
121-
this->ghist_index_source_->Reset();
123+
this->ghist_index_source_->Reset(param);
122124

123125
if (!std::isnan(param.sparse_thresh) &&
124126
param.sparse_thresh != tree::TrainParam::DftSparseThreshold()) {

src/data/extmem_quantile_dmatrix.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "proxy_dmatrix.h" // for DataIterProxy
1212
#include "xgboost/context.h" // for Context
1313
#include "xgboost/data.h" // for BatchParam
14+
#include "../common/cuda_rt_utils.h"
1415

1516
namespace xgboost::data {
1617
void ExtMemQuantileDMatrix::InitFromCUDA(
@@ -78,9 +79,9 @@ BatchSet<EllpackPage> ExtMemQuantileDMatrix::GetEllpackBatches(Context const *,
7879
}
7980

8081
std::visit(
81-
[this](auto &&ptr) {
82+
[this, param](auto &&ptr) {
8283
CHECK(ptr);
83-
ptr->Reset();
84+
ptr->Reset(param);
8485
},
8586
this->ellpack_page_source_);
8687

src/data/gradient_index_page_source.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ void ExtGradientIndexPageSource::Fetch() {
3737
CHECK_GE(source_->Iter(), 1);
3838
CHECK_NE(cuts_.Values().size(), 0);
3939
HostAdapterDispatch(proxy_, [this](auto const& value) {
40+
CHECK(this->proxy_->Ctx()->IsCPU()) << "All batches must use the same device type.";
4041
// This does three things:
4142
// - Generate CSR matrix for gradient index.
4243
// - Generate the column matrix for gradient index.

src/data/gradient_index_page_source.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class GHistIndexFormatPolicy {
3131
using FormatT = SparsePageFormat<GHistIndexMatrix>;
3232

3333
public:
34-
[[nodiscard]] auto CreatePageFormat() const {
34+
[[nodiscard]] auto CreatePageFormat(BatchParam const&) const {
3535
std::unique_ptr<FormatT> fmt{new GHistIndexRawFormat{cuts_}};
3636
return fmt;
3737
}

src/data/sparse_page_dmatrix.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ void SparsePageDMatrix::InitializeSparsePage(Context const *ctx) {
8282
// release the iterator and data.
8383
if (cache_info_.at(id)->written) {
8484
CHECK(sparse_page_source_);
85-
sparse_page_source_->Reset();
85+
sparse_page_source_->Reset({});
8686
return;
8787
}
8888

@@ -114,7 +114,7 @@ BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches(Context const *ctx) {
114114
std::make_shared<CSCPageSource>(this->missing_, ctx->Threads(), this->Info().num_col_,
115115
this->n_batches_, cache_info_.at(id), sparse_page_source_);
116116
} else {
117-
column_source_->Reset();
117+
column_source_->Reset({});
118118
}
119119
return BatchSet{BatchIterator<CSCPage>{this->column_source_}};
120120
}
@@ -129,7 +129,7 @@ BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches(Context const
129129
this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
130130
sparse_page_source_);
131131
} else {
132-
sorted_column_source_->Reset();
132+
sorted_column_source_->Reset({});
133133
}
134134
return BatchSet{BatchIterator<SortedCSCPage>{this->sorted_column_source_}};
135135
}
@@ -161,7 +161,7 @@ BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ct
161161
param, std::move(cuts), this->IsDense(), ft, sparse_page_source_));
162162
} else {
163163
CHECK(ghist_index_source_);
164-
ghist_index_source_->Reset();
164+
ghist_index_source_->Reset(param);
165165
}
166166
return BatchSet{BatchIterator<GHistIndexMatrix>{this->ghist_index_source_}};
167167
}

0 commit comments

Comments
 (0)