
Commit 337ee78

wbo4958 and trivialfis authored
[jvm-packages] Supports external memory (dmlc#11186)
--------- Co-authored-by: Jiaming Yuan <[email protected]>
1 parent 688c2f5 commit 337ee78

File tree

37 files changed: +1273 -322 lines


CMakeLists.txt

Lines changed: 11 additions & 2 deletions
@@ -112,10 +112,19 @@ option(ADD_PKGCONFIG "Add xgboost.pc into system." ON)
 if(USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug)))
   message(SEND_ERROR "Do not enable `USE_DEBUG_OUTPUT' with release build.")
 endif()
-if(USE_NCCL AND NOT (USE_CUDA))
+if(USE_NVTX AND (NOT USE_CUDA))
+  message(SEND_ERROR "`USE_NVTX` must be enabled with `USE_CUDA` flag.")
+endif()
+if(USE_NVTX)
+  if(CMAKE_VERSION VERSION_LESS "3.25.0")
+    # CUDA:nvtx3 target is added in 3.25
+    message("cmake >= 3.25 is required for NVTX.")
+  endif()
+endif()
+if(USE_NCCL AND (NOT USE_CUDA))
   message(SEND_ERROR "`USE_NCCL` must be enabled with `USE_CUDA` flag.")
 endif()
-if(USE_DEVICE_DEBUG AND NOT (USE_CUDA))
+if(USE_DEVICE_DEBUG AND (NOT USE_CUDA))
   message(SEND_ERROR "`USE_DEVICE_DEBUG` must be enabled with `USE_CUDA` flag.")
 endif()
 if(BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))

demo/rmm_plugin/README.rst

Lines changed: 2 additions & 1 deletion
@@ -58,7 +58,8 @@ Since with RMM the memory pool is pre-allocated on a specific device, changing the
 device ordinal in XGBoost can result in memory error ``cudaErrorIllegalAddress``. Use the
 ``CUDA_VISIBLE_DEVICES`` environment variable instead of the ``device="cuda:1"`` parameter
 for selecting device. For distributed training, the distributed computing frameworks like
-``dask-cuda`` are responsible for device management.
+``dask-cuda`` are responsible for device management. For Scala-Spark, see
+:doc:`/jvm/xgboost4j_spark_gpu_tutorial` for more info.
 
 ************************
 Memory Over-Subscription

doc/build.rst

Lines changed: 4 additions & 2 deletions
@@ -394,7 +394,8 @@ Additional System-dependent Features
 - OpenMP on MacOS: See :ref:`running_cmake_and_build` for installing ``openmp``. The flag
   ``mvn -Duse.openmp=OFF`` can be used to disable OpenMP support.
 - GPU support can be enabled by passing an additional flag to maven ``mvn -Duse.cuda=ON
-  install``. See :ref:`build_gpu_support` for more info.
+  install``. See :ref:`build_gpu_support` for more info. In addition, ``-Dplugin.rmm=ON``
+  can enable the optional RMM support.
 
 **************************
 Building the Documentation
@@ -414,4 +415,5 @@ build it locally, you need a installed XGBoost with all its dependencies along with
 
 Under ``xgboost/doc`` directory, run ``make <format>`` with ``<format>`` replaced by the
 format you want. For a list of supported formats, run ``make help`` under the same
-directory.
+directory. This builds a partial document for Python but not other language bindings. To
+build the full document, see :doc:`/contrib/docs`.

doc/jvm/java_intro.rst

Lines changed: 2 additions & 2 deletions
@@ -127,7 +127,7 @@ With parameters and data, you are able to train a booster model.
 
 .. code-block:: java
 
-  booster.saveModel("model.bin");
+  booster.saveModel("model.json");
 
 * Generating model dump with feature map
 
@@ -142,7 +142,7 @@ With parameters and data, you are able to train a booster model.
 
 .. code-block:: java
 
-  Booster booster = XGBoost.loadModel("model.bin");
+  Booster booster = XGBoost.loadModel("model.json");
 
 **********
 Prediction
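
For context, XGBoost's native library picks the model serialization format from the file suffix, so the tutorial's switch from ``model.bin`` to ``model.json`` selects the JSON format. A minimal round-trip sketch using the xgboost4j Scala binding; the training file path and parameters are illustrative, not part of this commit:

  import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost}

  object SaveLoadRoundTrip {
    def main(args: Array[String]): Unit = {
      // Hypothetical LIBSVM training file; replace with a real path.
      val train = new DMatrix("train.libsvm?format=libsvm")
      val params = Map("objective" -> "binary:logistic", "max_depth" -> 3)
      val booster = XGBoost.train(train, params, 10)

      // A ".json" suffix selects the JSON model format, hence the
      // tutorial's change from "model.bin" to "model.json".
      booster.saveModel("model.json")

      // Reload and predict with the restored booster.
      val restored = XGBoost.loadModel("model.json")
      val preds: Array[Array[Float]] = restored.predict(train)
    }
  }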

doc/jvm/xgboost4j_spark_gpu_tutorial.rst

Lines changed: 23 additions & 0 deletions
@@ -259,3 +259,26 @@ For details about other ``RAPIDS Accelerator`` other configurations, please refer
 
 For ``RAPIDS Accelerator Frequently Asked Questions``, please refer to the
 `frequently-asked-questions <https://docs.nvidia.com/spark-rapids/user-guide/latest/faq.html>`_.
+
+***********
+RMM Support
+***********
+
+.. versionadded:: 3.0
+
+When compiled with the RMM plugin (see :doc:`/build`), the XGBoost spark package can reuse
+the RMM memory pool automatically based on `spark.rapids.memory.gpu.pooling.enabled` and
+`spark.rapids.memory.gpu.pool`. Please note that both submit options need to be set
+accordingly. In addition, XGBoost employs NCCL for GPU communication, which requires some
+GPU memory for communication buffers, so one should not let RMM take all the available
+memory. Example configuration related to the memory pool:
+
+.. code-block:: bash
+
+  spark-submit \
+    --master $master \
+    --conf spark.rapids.memory.gpu.allocFraction=0.5 \
+    --conf spark.rapids.memory.gpu.maxAllocFraction=0.8 \
+    --conf spark.rapids.memory.gpu.pool=ARENA \
+    --conf spark.rapids.memory.gpu.pooling.enabled=true \
+    ...

doc/jvm/xgboost4j_spark_tutorial.rst

Lines changed: 27 additions & 0 deletions
@@ -561,3 +561,30 @@ An equivalent way is to pass in parameters in XGBoostClassifier's constructor:
 
 If the training failed during these 100 rounds, the next run of training would start by reading the latest checkpoint
 file in ``/checkpoints_path`` and start from the iteration when the checkpoint was built until to next failure or the specified 100 rounds.
+
+
+***************
+External Memory
+***************
+
+.. versionadded:: 3.0
+
+.. warning::
+
+  The feature is experimental.
+
+Here we refer to the iterator-based external memory instead of the one that uses special
+URL parameters. XGBoost-Spark has experimental support for GPU-based external memory
+training (:doc:`/jvm/xgboost4j_spark_gpu_tutorial`) since 3.0. When it's used in
+combination with GPU-based training, data is first cached on disk and then staged in CPU
+memory. See :doc:`/tutorials/external_memory` for the general concept and best practices
+for external memory training. In addition, see the doc string of the estimator parameter
+``useExternalMemory``. With Spark estimators:
+
+.. code-block:: scala
+
+  val xgbClassifier = new XGBoostClassifier(xgbParam)
+    .setFeaturesCol(featuresNames)
+    .setLabelCol(labelName)
+    .setUseExternalMemory(true)
+    .setDevice("cuda") // CPU is not yet supported

jvm-packages/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@ build.sh
 xgboost4j-tester/pom.xml
 xgboost4j-tester/iris.csv
 dependency-reduced-pom.xml
+.factorypath

jvm-packages/create_jni.py

Lines changed: 7 additions & 0 deletions
@@ -73,6 +73,10 @@ def native_build(cli_args: argparse.Namespace) -> None:
         os.environ["JAVA_HOME"] = (
             subprocess.check_output("/usr/libexec/java_home").strip().decode()
         )
+    if cli_args.use_debug == "ON":
+        CONFIG["CMAKE_BUILD_TYPE"] = "Debug"
+    CONFIG["USE_NVTX"] = cli_args.use_nvtx
+    CONFIG["PLUGIN_RMM"] = cli_args.plugin_rmm
 
     print("building Java wrapper", flush=True)
     with cd(".."):
@@ -187,5 +191,8 @@ def native_build(cli_args: argparse.Namespace) -> None:
     )
     parser.add_argument("--use-cuda", type=str, choices=["ON", "OFF"], default="OFF")
     parser.add_argument("--use-openmp", type=str, choices=["ON", "OFF"], default="ON")
+    parser.add_argument("--use-debug", type=str, choices=["ON", "OFF"], default="OFF")
+    parser.add_argument("--use-nvtx", type=str, choices=["ON", "OFF"], default="OFF")
+    parser.add_argument("--plugin-rmm", type=str, choices=["ON", "OFF"], default="OFF")
     cli_args = parser.parse_args()
     native_build(cli_args)

jvm-packages/pom.xml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@
5757
<log.capi.invocation>OFF</log.capi.invocation>
5858
<use.cuda>OFF</use.cuda>
5959
<use.openmp>ON</use.openmp>
60+
<use.debug>OFF</use.debug>
61+
<use.nvtx>OFF</use.nvtx>
62+
<plugin.rmm>OFF</plugin.rmm>
6063
<cudf.version>24.10.0</cudf.version>
6164
<spark.rapids.version>24.10.0</spark.rapids.version>
6265
<spark.rapids.classifier>cuda12</spark.rapids.classifier>

jvm-packages/xgboost4j-spark-gpu/src/main/java/ml/dmlc/xgboost4j/java/CudfColumnBatch.java

Lines changed: 11 additions & 0 deletions
@@ -86,6 +86,17 @@ private List<CudfColumn> initializeCudfColumns(Table table) {
         .collect(Collectors.toList());
   }
 
+  // visible for testing
+  public Table getFeatureTable() {
+    return featureTable;
+  }
+
+  // visible for testing
+  public Table getLabelTable() {
+    return labelTable;
+  }
+
   public List<CudfColumn> getFeatures() {
     return features;
   }
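
The two getters above exist so tests can reach the underlying cudf ``Table`` objects. A sketch of how a test might exercise them; the four-argument constructor shape (features, label, weight, base margin) is an assumption carried over from earlier releases, the ``Float`` overload of ``Table.TestBuilder.column`` is likewise assumed, and building tables requires a GPU at runtime:

  import ai.rapids.cudf.Table
  import ml.dmlc.xgboost4j.java.CudfColumnBatch

  // Tiny single-column tables built with the cudf-java test helper.
  val features: Table = new Table.TestBuilder().column(1f: java.lang.Float, 2f, 3f).build()
  val labels: Table = new Table.TestBuilder().column(0f: java.lang.Float, 1f, 0f).build()

  // Assumed constructor shape: (featureTable, labelTable, weightTable, baseMarginTable).
  val batch = new CudfColumnBatch(features, labels, null, null)

  // The new accessors return the exact tables the batch was built from.
  assert(batch.getFeatureTable eq features)
  assert(batch.getLabelTable eq labels)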
