Commit 3f727c2

[EM] Update demos for new dependencies. (dmlc#11234)
Drop the checks for old versions of python-cuda and rmm.
1 parent a9a8461 commit 3f727c2

File tree

2 files changed: +72 -48 lines changed


demo/guide-python/distributed_extmem_basic.py

Lines changed: 54 additions & 27 deletions
@@ -21,9 +21,11 @@
 import argparse
 import multiprocessing as mp
 import os
+import sys
 import tempfile
-from functools import partial, update_wrapper
-from typing import Callable, List, Tuple
+import traceback
+from functools import partial, update_wrapper, wraps
+from typing import Callable, List, ParamSpec, Tuple, TypeVar

 import numpy as np
 from loky import get_reusable_executor
@@ -98,34 +100,55 @@ def reset(self) -> None:


 def setup_rmm() -> None:
-    """Setup RMM for GPU-based external memory training."""
+    """Setup RMM for GPU-based external memory training.
+
+    It's important to use RMM with `CudaAsyncMemoryResource` or `ArenaMemoryResource`
+    for GPU-based external memory to improve performance. If XGBoost is not built with
+    RMM support, a warning is raised when constructing the `DMatrix`.
+
+    """
     import rmm
+    from cuda import cudart
     from rmm.allocators.cupy import rmm_cupy_allocator
+    from rmm.mr import ArenaMemoryResource

     if not xgboost.build_info()["USE_RMM"]:
         return

-    try:
-        # Use the arena pool if available
-        from cuda.bindings import runtime as cudart
-        from rmm.mr import ArenaMemoryResource
-
-        status, free, total = cudart.cudaMemGetInfo()
-        if status != cudart.cudaError_t.cudaSuccess:
-            raise RuntimeError(cudart.cudaGetErrorString(status))
-
-        mr = rmm.mr.CudaMemoryResource()
-        mr = ArenaMemoryResource(mr, arena_size=int(total * 0.9))
-    except ImportError:
-        # The combination of pool and async is by design. As XGBoost needs to allocate
-        # large pages repeatly, it's not easy to handle fragmentation. We can use more
-        # experiments here.
-        mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
-    rmm.mr.set_current_device_resource(mr)
+    status, free, total = cudart.cudaMemGetInfo()
+    if status != cudart.cudaError_t.cudaSuccess:
+        raise RuntimeError(cudart.cudaGetErrorString(status))
+
+    mr = rmm.mr.CudaMemoryResource()
+    mr = ArenaMemoryResource(mr, arena_size=int(total * 0.9))
+
+    rmm.mr.set_current_device_resource(mr)
     # Set the allocator for cupy as well.
     cp.cuda.set_allocator(rmm_cupy_allocator)


+R = TypeVar("R")
+P = ParamSpec("P")
+
+
+def try_run(fn: Callable[P, R]) -> Callable[P, R]:
+    """Loky aborts the process without printing out any error message if there's an
+    exception.
+
+    """
+
+    @wraps(fn)
+    def inner(*args: P.args, **kwargs: P.kwargs) -> R:
+        try:
+            return fn(*args, **kwargs)
+        except Exception as e:
+            print(traceback.format_exc(), file=sys.stderr)
+            raise RuntimeError("Running into exception in worker.") from e
+
+    return inner
+
+
+@try_run
 def hist_train(worker_idx: int, tmpdir: str, device: str, rabit_args: dict) -> None:
     """The hist tree method can use a special data structure `ExtMemQuantileDMatrix` for
     faster initialization and lower memory usage.
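The `try_run` decorator added above exists because loky can terminate a worker before the worker's own traceback is ever shown. Below is a minimal standalone sketch of the same pattern; the `fail` worker is a made-up example, not part of this commit, and the type parameters are omitted for brevity:

import sys
import traceback
from functools import wraps

from loky import get_reusable_executor


def try_run(fn):
    # Print the traceback ourselves; loky may drop the worker silently otherwise.
    @wraps(fn)
    def inner(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except Exception as e:
            print(traceback.format_exc(), file=sys.stderr)
            raise RuntimeError("Running into exception in worker.") from e

    return inner


@try_run
def fail(i: int) -> int:
    # Hypothetical worker used only to trigger an error.
    raise ValueError(f"worker {i} failed")


if __name__ == "__main__":
    with get_reusable_executor(max_workers=2) as pool:
        # The worker's full traceback is printed to stderr before the parent
        # receives the re-raised RuntimeError.
        list(pool.map(fail, range(2)))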
@@ -153,7 +176,11 @@ def hist_train(worker_idx: int, tmpdir: str, device: str, rabit_args: dict) -> None:
         )
         # Check the device is correctly set.
         if device == "cuda":
-            assert int(os.environ["CUDA_VISIBLE_DEVICES"]) < coll.get_world_size()
+            # Check the first device
+            assert (
+                int(os.environ["CUDA_VISIBLE_DEVICES"].split(",")[0])
+                < coll.get_world_size()
+            )
         booster = xgboost.train(
             {
                 "tree_method": "hist",
@@ -180,8 +207,12 @@ def initializer(device: str) -> None:
         if device == "cuda":
             # name: LokyProcess-1
             lop, sidx = mp.current_process().name.split("-")
-            idx = int(sidx)  # 1-based indexing from loky
-            os.environ["CUDA_VISIBLE_DEVICES"] = str(idx - 1)
+            idx = int(sidx) - 1  # 1-based indexing from loky
+            # Assuming two workers for demo.
+            devices = ",".join([str(idx), str((idx + 1) % n_workers)])
+            # P0: CUDA_VISIBLE_DEVICES=0,1
+            # P1: CUDA_VISIBLE_DEVICES=1,0
+            os.environ["CUDA_VISIBLE_DEVICES"] = devices
             setup_rmm()

     with get_reusable_executor(
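The new `CUDA_VISIBLE_DEVICES` scheme gives every worker a different first (ordinal-0) device while keeping all GPUs visible to the process. A GPU-free sketch of the rotation used in `initializer()` above, assuming the demo's two workers:

n_workers = 2  # the demo assumes two workers

for idx in range(n_workers):
    # Rotate the device list so each worker's own GPU is ordinal 0, while no
    # GPU is hidden from any process.
    devices = ",".join([str(idx), str((idx + 1) % n_workers)])
    print(f"P{idx}: CUDA_VISIBLE_DEVICES={devices}")

# Prints:
# P0: CUDA_VISIBLE_DEVICES=0,1
# P1: CUDA_VISIBLE_DEVICES=1,0

This is also why the assertion in `hist_train` above now checks only the first entry of `CUDA_VISIBLE_DEVICES`.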
@@ -204,10 +235,6 @@ def initializer(device: str) -> None:
     if args.device == "cuda":
         import cupy as cp

-        # It's important to use RMM with `CudaAsyncMemoryResource`. for GPU-based
-        # external memory to improve performance. If XGBoost is not built with RMM
-        # support, a warning is raised when constructing the `DMatrix`.
-        setup_rmm()
         with tempfile.TemporaryDirectory() as tmpdir:
             main(tmpdir, args)
     else:

demo/guide-python/external_memory.py

Lines changed: 18 additions & 21 deletions
@@ -154,30 +154,30 @@ def main(tmpdir: str, args: argparse.Namespace) -> None:


 def setup_rmm() -> None:
-    """Setup RMM for GPU-based external memory training."""
+    """Setup RMM for GPU-based external memory training.
+
+    It's important to use RMM with `CudaAsyncMemoryResource` or `ArenaMemoryResource`
+    for GPU-based external memory to improve performance. If XGBoost is not built with
+    RMM support, a warning is raised when constructing the `DMatrix`.
+
+    """
+
     import rmm
+    from cuda import cudart
     from rmm.allocators.cupy import rmm_cupy_allocator
+    from rmm.mr import ArenaMemoryResource

     if not xgboost.build_info()["USE_RMM"]:
         return

-    try:
-        # Use the arena pool if available
-        from cuda.bindings import runtime as cudart
-        from rmm.mr import ArenaMemoryResource
-
-        status, free, total = cudart.cudaMemGetInfo()
-        if status != cudart.cudaError_t.cudaSuccess:
-            raise RuntimeError(cudart.cudaGetErrorString(status))
-
-        mr = rmm.mr.CudaMemoryResource()
-        mr = ArenaMemoryResource(mr, arena_size=int(total * 0.9))
-    except ImportError:
-        # The combination of pool and async is by design. As XGBoost needs to allocate
-        # large pages repeatly, it's not easy to handle fragmentation. We can use more
-        # experiments here.
-        mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
-    rmm.mr.set_current_device_resource(mr)
+    status, free, total = cudart.cudaMemGetInfo()
+    if status != cudart.cudaError_t.cudaSuccess:
+        raise RuntimeError(cudart.cudaGetErrorString(status))
+
+    mr = rmm.mr.CudaMemoryResource()
+    mr = ArenaMemoryResource(mr, arena_size=int(total * 0.9))
+
+    rmm.mr.set_current_device_resource(mr)
     # Set the allocator for cupy as well.
     cp.cuda.set_allocator(rmm_cupy_allocator)

@@ -189,9 +189,6 @@ def setup_rmm() -> None:
     if args.device == "cuda":
        import cupy as cp

-        # It's important to use RMM with `CudaAsyncMemoryResource`. for GPU-based
-        # external memory to improve performance. If XGBoost is not built with RMM
-        # support, a warning is raised when constructing the `DMatrix`.
         setup_rmm()
         # Make sure XGBoost is using RMM for all allocations.
         with xgboost.config_context(use_rmm=True):
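Taken together, the updated `setup_rmm` and the existing `use_rmm` context give the following standalone setup path. This is only a sketch assembled from the code in this commit; the training step is left as a placeholder:

import cupy as cp
import rmm
import xgboost
from cuda import cudart
from rmm.allocators.cupy import rmm_cupy_allocator
from rmm.mr import ArenaMemoryResource


def setup_rmm() -> None:
    # Skip if XGBoost was built without RMM support.
    if not xgboost.build_info()["USE_RMM"]:
        return
    status, _free, total = cudart.cudaMemGetInfo()
    if status != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError(cudart.cudaGetErrorString(status))
    # Arena resource over the plain CUDA resource, sized to ~90% of device memory.
    mr = ArenaMemoryResource(rmm.mr.CudaMemoryResource(), arena_size=int(total * 0.9))
    rmm.mr.set_current_device_resource(mr)
    # Route cupy allocations through RMM as well.
    cp.cuda.set_allocator(rmm_cupy_allocator)


if __name__ == "__main__":
    setup_rmm()
    # Make sure XGBoost is using RMM for all allocations.
    with xgboost.config_context(use_rmm=True):
        pass  # construct the ExtMemQuantileDMatrix and call xgboost.train() here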
