
Commit 8d7fe26

[EM] Enable access to the number of batches. (dmlc#10691)

- Expose `NumBatches` in `DMatrix`.
- Small cleanup for removing legacy CUDA stream and ~force CUDA context initialization~.
- Purge old external memory data generation code.

1 parent 033a666 commit 8d7fe26

26 files changed: 168 additions & 351 deletions

include/xgboost/data.h

Lines changed: 5 additions & 2 deletions

@@ -541,9 +541,12 @@ class DMatrix {
   [[nodiscard]] bool PageExists() const;

   /**
-   * @return Whether the data columns single column block.
+   * @return Whether the contains a single batch.
+   *
+   * The naming is legacy.
    */
-  [[nodiscard]] virtual bool SingleColBlock() const = 0;
+  [[nodiscard]] bool SingleColBlock() const { return this->NumBatches() == 1; }
+  [[nodiscard]] virtual std::int32_t NumBatches() const { return 1; }

   virtual ~DMatrix();
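
To illustrate the new interface outside the library: SingleColBlock() is now a concrete wrapper over the virtual NumBatches(), so a subclass only has to report its batch count. The sketch below uses stand-in class names (FakeDMatrix, FakeExtMemDMatrix); it is not xgboost code, just a minimal model of the relationship.

#include <cstdint>
#include <iostream>

// Minimal stand-ins mirroring the new interface: SingleColBlock() is a thin
// wrapper over the virtual NumBatches(), which defaults to 1.
class FakeDMatrix {
 public:
  [[nodiscard]] bool SingleColBlock() const { return this->NumBatches() == 1; }
  [[nodiscard]] virtual std::int32_t NumBatches() const { return 1; }
  virtual ~FakeDMatrix() = default;
};

// An external-memory-style matrix only needs to report its batch count; the
// legacy SingleColBlock() query keeps working without an extra override.
class FakeExtMemDMatrix : public FakeDMatrix {
  std::int32_t n_batches_;

 public:
  explicit FakeExtMemDMatrix(std::int32_t n_batches) : n_batches_{n_batches} {}
  [[nodiscard]] std::int32_t NumBatches() const override { return n_batches_; }
};

int main() {
  FakeExtMemDMatrix m{4};
  std::cout << m.NumBatches() << " batches, single block: " << m.SingleColBlock() << "\n";
  return 0;  // prints: 4 batches, single block: 0
}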

src/common/device_helpers.cuh

Lines changed: 8 additions & 18 deletions

@@ -486,24 +486,20 @@ class TypedDiscard : public thrust::discard_iterator<T> {
 }  // namespace detail

 template <typename T>
-using TypedDiscard =
-    std::conditional_t<HasThrustMinorVer<12>(), detail::TypedDiscardCTK114<T>,
-                       detail::TypedDiscard<T>>;
+using TypedDiscard = std::conditional_t<HasThrustMinorVer<12>(), detail::TypedDiscardCTK114<T>,
+                                        detail::TypedDiscard<T>>;

 template <typename VectorT, typename T = typename VectorT::value_type,
-          typename IndexT = typename xgboost::common::Span<T>::index_type>
-xgboost::common::Span<T> ToSpan(
-    VectorT &vec,
-    IndexT offset = 0,
-    IndexT size = std::numeric_limits<size_t>::max()) {
+          typename IndexT = typename xgboost::common::Span<T>::index_type>
+xgboost::common::Span<T> ToSpan(VectorT &vec, IndexT offset = 0,
+                                IndexT size = std::numeric_limits<size_t>::max()) {
   size = size == std::numeric_limits<size_t>::max() ? vec.size() : size;
   CHECK_LE(offset + size, vec.size());
-  return {vec.data().get() + offset, size};
+  return {thrust::raw_pointer_cast(vec.data()) + offset, size};
 }

 template <typename T>
-xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
-                                size_t offset, size_t size) {
+xgboost::common::Span<T> ToSpan(thrust::device_vector<T> &vec, size_t offset, size_t size) {
   return ToSpan(vec, offset, size);
 }

@@ -874,13 +870,7 @@ inline void CUDAEvent::Record(CUDAStreamView stream) { // NOLINT

 // Changing this has effect on prediction return, where we need to pass the pointer to
 // third-party libraries like cuPy
-inline CUDAStreamView DefaultStream() {
-#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
-  return CUDAStreamView{cudaStreamPerThread};
-#else
-  return CUDAStreamView{cudaStreamLegacy};
-#endif
-}
+inline CUDAStreamView DefaultStream() { return CUDAStreamView{cudaStreamPerThread}; }

 class CUDAStream {
   cudaStream_t stream_;
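
A note on the ToSpan change: thrust::raw_pointer_cast(vec.data()) extracts the raw device pointer without assuming the exact pointer type that data() returns, whereas .get() only works on a plain thrust::device_ptr. A minimal stand-alone sketch of the same pattern follows; SimpleSpan and ToSpanSketch are illustrative stand-ins, not the library's types.

// sketch.cu — compile with nvcc; demonstrates the raw_pointer_cast pattern.
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <cstddef>
#include <cstdio>

template <typename T>
struct SimpleSpan {  // stand-in for xgboost::common::Span
  T* data;
  std::size_t size;
};

template <typename VectorT, typename T = typename VectorT::value_type>
SimpleSpan<T> ToSpanSketch(VectorT& vec, std::size_t offset, std::size_t size) {
  // raw_pointer_cast yields T* for any device_vector, regardless of allocator.
  return {thrust::raw_pointer_cast(vec.data()) + offset, size};
}

int main() {
  thrust::device_vector<float> vec(8);
  thrust::sequence(vec.begin(), vec.end());  // 0, 1, 2, ...
  auto span = ToSpanSketch(vec, 2, 4);       // view over elements 2..5
  std::printf("span of %zu elements starting at %p\n", span.size,
              static_cast<void*>(span.data));
  return 0;
}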

src/data/extmem_quantile_dmatrix.cc

Lines changed: 2 additions & 0 deletions

@@ -74,6 +74,8 @@ void ExtMemQuantileDMatrix::InitFromCPU(
   cpu_impl::GetDataShape(ctx, proxy, *iter, missing, &ext_info);
   ext_info.SetInfo(ctx, &this->info_);

+  this->n_batches_ = ext_info.n_batches;
+
   /**
    * Generate quantiles
    */

src/data/extmem_quantile_dmatrix.h

Lines changed: 2 additions & 1 deletion

@@ -33,7 +33,7 @@ class ExtMemQuantileDMatrix : public QuantileDMatrix {
                          std::string cache, bst_bin_t max_bin, bool on_host);
   ~ExtMemQuantileDMatrix() override;

-  [[nodiscard]] bool SingleColBlock() const override { return false; }
+  [[nodiscard]] std::int32_t NumBatches() const override { return n_batches_; }

  private:
   void InitFromCPU(
@@ -63,6 +63,7 @@ class ExtMemQuantileDMatrix : public QuantileDMatrix {
   std::string cache_prefix_;
   bool on_host_;
   BatchParam batch_;
+  bst_idx_t n_batches_{0};

   using EllpackDiskPtr = std::shared_ptr<ExtEllpackPageSource>;
   using EllpackHostPtr = std::shared_ptr<ExtEllpackPageHostSource>;

src/data/iterative_dmatrix.h

Lines changed: 0 additions & 2 deletions

@@ -57,8 +57,6 @@ class IterativeDMatrix : public QuantileDMatrix {

   BatchSet<EllpackPage> GetEllpackBatches(Context const *ctx, const BatchParam &param) override;
   BatchSet<ExtSparsePage> GetExtBatches(Context const *ctx, BatchParam const &param) override;
-
-  bool SingleColBlock() const override { return true; }
 };
 }  // namespace data
 }  // namespace xgboost

src/data/proxy_dmatrix.h

Lines changed: 0 additions & 1 deletion

@@ -94,7 +94,6 @@ class DMatrixProxy : public DMatrix {
   MetaInfo const& Info() const override { return info_; }
   Context const* Ctx() const override { return &ctx_; }

-  bool SingleColBlock() const override { return false; }
   bool EllpackExists() const override { return false; }
   bool GHistIndexExists() const override { return false; }
   bool SparsePageExists() const override { return false; }

src/data/simple_dmatrix.h

Lines changed: 0 additions & 1 deletion

@@ -33,7 +33,6 @@ class SimpleDMatrix : public DMatrix {
   const MetaInfo& Info() const override;
   Context const* Ctx() const override { return &fmat_ctx_; }

-  bool SingleColBlock() const override { return true; }
   DMatrix* Slice(common::Span<int32_t const> ridxs) override;
   DMatrix* SliceCol(int num_slices, int slice_id) override;


src/data/sparse_page_dmatrix.h

Lines changed: 1 addition & 2 deletions

@@ -90,8 +90,7 @@ class SparsePageDMatrix : public DMatrix {
   [[nodiscard]] MetaInfo &Info() override;
   [[nodiscard]] const MetaInfo &Info() const override;
   [[nodiscard]] Context const *Ctx() const override { return &fmat_ctx_; }
-  // The only DMatrix implementation that returns false.
-  [[nodiscard]] bool SingleColBlock() const override { return false; }
+  [[nodiscard]] std::int32_t NumBatches() const override { return n_batches_; }
   DMatrix *Slice(common::Span<std::int32_t const>) override {
     LOG(FATAL) << "Slicing DMatrix is not supported for external memory.";
     return nullptr;

src/data/sparse_page_source.cc

Lines changed: 6 additions & 2 deletions

@@ -3,10 +3,10 @@
  */
 #include "sparse_page_source.h"

-#include <filesystem>  // for exists
-#include <string>      // for string
 #include <cstdio>      // for remove
+#include <filesystem>  // for exists
 #include <numeric>     // for partial_sum
+#include <string>      // for string

 namespace xgboost::data {
 void Cache::Commit() {
@@ -27,4 +27,8 @@ void TryDeleteCacheFile(const std::string& file) {
                << "; you may want to remove it manually";
   }
 }
+
+#if !defined(XGBOOST_USE_CUDA)
+void InitNewThread::operator()() const { *GlobalConfigThreadLocalStore::Get() = config; }
+#endif
 }  // namespace xgboost::data
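
The CPU-only definition above copies the captured global configuration into the thread-local store of whichever thread invokes the functor, so worker threads see the settings chosen on the launching thread. A self-contained sketch of that pattern, using hypothetical stand-ins (GlobalConfig, ThreadLocalConfig, InitNewThreadSketch) rather than xgboost's types:

#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

struct GlobalConfig {
  std::int32_t verbosity{1};
};

// Stand-in for GlobalConfigThreadLocalStore: one instance per thread.
GlobalConfig* ThreadLocalConfig() {
  thread_local GlobalConfig cfg;
  return &cfg;
}

// Mirrors the shape of InitNewThread: it captures a copy of the config and,
// when invoked at thread start, publishes it into that thread's storage.
struct InitNewThreadSketch {
  GlobalConfig config;  // copied on the launching thread
  void operator()() const { *ThreadLocalConfig() = config; }
};

int main() {
  ThreadLocalConfig()->verbosity = 3;              // configured on the main thread
  InitNewThreadSketch init{*ThreadLocalConfig()};  // snapshot for workers

  std::vector<std::thread> workers;
  for (int i = 0; i < 2; ++i) {
    workers.emplace_back([init] {
      init();  // what a thread pool would call first in each new thread
      std::cout << "worker verbosity: " << ThreadLocalConfig()->verbosity << "\n";
    });
  }
  for (auto& t : workers) { t.join(); }
  return 0;
}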

src/data/sparse_page_source.cu

Lines changed: 10 additions & 0 deletions

@@ -18,4 +18,14 @@ void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
   cuda_impl::Dispatch(proxy,
                       [&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
 }
+
+void InitNewThread::operator()() const {
+  *GlobalConfigThreadLocalStore::Get() = config;
+  // For CUDA 12.2, we need to force initialize the CUDA context by synchronizing the
+  // stream when creating a new thread in the thread pool. While for CUDA 11.8, this
+  // action might cause an insufficient driver version error for some reason. Lastly, it
+  // should work with CUDA 12.5 without any action being taken.
+
+  // dh::DefaultStream().Sync();
+}
 }  // namespace xgboost::data
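
Note that the CUDA build keeps the stream synchronization commented out, so this commit does not force-initialize any context. If such initialization ever became necessary, one hypothetical way to gate it on the runtime version is sketched below; MaybeInitCudaContext is illustrative only and not part of the commit.

// sketch.cu — a hypothetical, version-gated context initialization helper.
#include <cuda_runtime_api.h>

#include <cstdio>
#include <thread>

void MaybeInitCudaContext() {
  int runtime_version = 0;
  if (cudaRuntimeGetVersion(&runtime_version) != cudaSuccess) {
    return;  // no usable runtime, nothing to initialize
  }
  // 12020 == CUDA 12.2; per the comment in the diff, 11.8 and 12.5 reportedly
  // do not need (or do not tolerate) the explicit synchronization.
  if (runtime_version == 12020) {
    cudaStreamSynchronize(cudaStreamPerThread);  // touching a stream creates the context
  }
}

int main() {
  std::thread worker([] {
    MaybeInitCudaContext();
    std::printf("worker thread has a CUDA context (if a device is present)\n");
  });
  worker.join();
  return 0;
}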
