5
5
.. versionadded:: 1.2.0
6
6
7
7
The demo that defines a customized iterator for passing batches of data into
8
- :py:class:`xgboost.QuantileDMatrix` and use this ``QuantileDMatrix`` for
9
- training. The feature is used primarily designed to reduce the required GPU
10
- memory for training on distributed environment.
8
+ :py:class:`xgboost.QuantileDMatrix` and uses this ``QuantileDMatrix`` for training. The
9
+ feature is primarily designed to reduce the required GPU memory for training on
10
+ a distributed environment.
11
11
12
- Aftering going through the demo, one might ask why don't we use more native
13
- Python iterator? That's because XGBoost requires a `reset` function, while
14
- using `itertools.tee` might incur significant memory usage according to:
12
+ After going through the demo, one might ask why we don't use a more native Python
13
+ iterator? That's because XGBoost requires a `reset` function, while using
14
+ `itertools.tee` might incur significant memory usage according to:
15
15
16
16
https://docs.python.org/3/library/itertools.html#itertools.tee.
17
17
18
+ .. seealso::
19
+
20
+ :ref:`sphx_glr_python_examples_external_memory.py`
21
+
18
22
"""
19
23
24
+ from typing import Callable
25
+
20
26
import cupy
21
27
import numpy
22
28
@@ -35,7 +41,7 @@ class IterForDMatrixDemo(xgboost.core.DataIter):
35
41
36
42
"""
37
43
38
- def __init__ (self ):
44
+ def __init__ (self ) -> None :
39
45
"""Generate some random data for demonstration.
40
46
41
47
Actual data can be anything that is currently supported by XGBoost.
@@ -50,41 +56,44 @@ def __init__(self):
50
56
self .it = 0 # set iterator to 0
51
57
super ().__init__ ()
52
58
53
- def as_array (self ):
59
+ def as_array (self ) -> cupy . ndarray :
54
60
return cupy .concatenate (self ._data )
55
61
56
- def as_array_labels (self ):
62
+ def as_array_labels (self ) -> cupy . ndarray :
57
63
return cupy .concatenate (self ._labels )
58
64
59
- def as_array_weights (self ):
65
+ def as_array_weights (self ) -> cupy . ndarray :
60
66
return cupy .concatenate (self ._weights )
61
67
62
- def data (self ):
68
+ def data (self ) -> cupy . ndarray :
63
69
"""Utility function for obtaining current batch of data."""
64
70
return self ._data [self .it ]
65
71
66
- def labels (self ):
72
+ def labels (self ) -> cupy . ndarray :
67
73
"""Utility function for obtaining current batch of label."""
68
74
return self ._labels [self .it ]
69
75
70
- def weights (self ):
76
+ def weights (self ) -> cupy . ndarray :
71
77
return self ._weights [self .it ]
72
78
73
- def reset (self ):
79
+ def reset (self ) -> None :
74
80
"""Reset the iterator"""
75
81
self .it = 0
76
82
77
- def next (self , input_data ) :
78
- """Yield next batch of data."""
83
+ def next (self , input_data : Callable ) -> bool :
84
+ """Yield the next batch of data."""
79
85
if self .it == len (self ._data ):
80
- # Return 0 when there's no more batch.
81
- return 0
86
+ # Return False to let XGBoost know this is the end of iteration
87
+ return False
88
+
89
+ # input_data is a keyword-only function passed in by XGBoost and has a similar
90
+ # signature to the ``DMatrix`` constructor.
82
91
input_data (data = self .data (), label = self .labels (), weight = self .weights ())
83
92
self .it += 1
84
- return 1
93
+ return True
85
94
86
95
87
- def main ():
96
+ def main () -> None :
88
97
rounds = 100
89
98
it = IterForDMatrixDemo ()
90
99
@@ -103,7 +112,7 @@ def main():
103
112
104
113
assert m_with_it .num_col () == m .num_col ()
105
114
assert m_with_it .num_row () == m .num_row ()
106
- # Tree meethod must be `hist`.
115
+ # Tree method must be `hist`.
107
116
reg_with_it = xgboost .train (
108
117
{"tree_method" : "hist" , "device" : "cuda" },
109
118
m_with_it ,
0 commit comments