@@ -504,8 +504,10 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
504
504
cache_prefix :
505
505
Prefix to the cache files, only used in external memory.
506
506
release_data :
507
- Whether the iterator should release the data during reset. Set it to True if the
508
- data transformation (converting data to np.float32 type) is expensive.
507
+ Whether the iterator should release the data during iteration. Set it to True if
508
+ the data transformation (converting data to np.float32 type) is memory
509
+ intensive. Otherwise, if the transformation is computation intensive then we can
510
+ keep the cache.
509
511
510
512
"""
511
513
@@ -517,15 +519,12 @@ def __init__(
517
519
self ._handle = _ProxyDMatrix ()
518
520
self ._exception : Optional [Exception ] = None
519
521
self ._enable_categorical = False
520
- self ._allow_host = True
521
522
self ._release = release_data
522
523
# Stage data in Python until reset or next is called to avoid data being freed.
523
524
self ._temporary_data : Optional [TransformedData ] = None
524
525
self ._data_ref : Optional [weakref .ReferenceType ] = None
525
526
526
- def get_callbacks (
527
- self , allow_host : bool , enable_categorical : bool
528
- ) -> Tuple [Callable , Callable ]:
527
+ def get_callbacks (self , enable_categorical : bool ) -> Tuple [Callable , Callable ]:
529
528
"""Get callback functions for iterating in C. This is an internal function."""
530
529
assert hasattr (self , "cache_prefix" ), "__init__ is not called."
531
530
self ._reset_callback = ctypes .CFUNCTYPE (None , ctypes .c_void_p )(
@@ -535,7 +534,6 @@ def get_callbacks(
535
534
ctypes .c_int ,
536
535
ctypes .c_void_p ,
537
536
)(self ._next_wrapper )
538
- self ._allow_host = allow_host
539
537
self ._enable_categorical = enable_categorical
540
538
return self ._reset_callback , self ._next_callback
541
539
@@ -624,14 +622,17 @@ def input_data(
624
622
)
625
623
# Stage the data, meta info are copied inside C++ MetaInfo.
626
624
self ._temporary_data = (new , cat_codes , feature_names , feature_types )
627
- dispatch_proxy_set_data (self .proxy , new , cat_codes , self . _allow_host )
625
+ dispatch_proxy_set_data (self .proxy , new , cat_codes )
628
626
self .proxy .set_info (
629
627
feature_names = feature_names ,
630
628
feature_types = feature_types ,
631
629
** kwargs ,
632
630
)
633
631
self ._data_ref = ref
634
632
633
+ # Release the data before next batch is loaded.
634
+ if self ._release :
635
+ self ._temporary_data = None
635
636
# pylint: disable=not-callable
636
637
return self ._handle_exception (lambda : self .next (input_data ), 0 )
637
638
@@ -911,7 +912,7 @@ def _init_from_iter(self, iterator: DataIter, enable_categorical: bool) -> None:
911
912
}
912
913
args_cstr = from_pystr_to_cstr (json .dumps (args ))
913
914
handle = ctypes .c_void_p ()
914
- reset_callback , next_callback = it .get_callbacks (True , enable_categorical )
915
+ reset_callback , next_callback = it .get_callbacks (enable_categorical )
915
916
ret = _LIB .XGDMatrixCreateFromCallback (
916
917
None ,
917
918
it .proxy .handle ,
@@ -1437,37 +1438,37 @@ def __init__(self) -> None: # pylint: disable=super-init-not-called
1437
1438
self .handle = ctypes .c_void_p ()
1438
1439
_check_call (_LIB .XGProxyDMatrixCreate (ctypes .byref (self .handle )))
1439
1440
1440
- def _set_data_from_cuda_interface (self , data : DataType ) -> None :
1441
- """Set data from CUDA array interface."""
1441
+ def _ref_data_from_cuda_interface (self , data : DataType ) -> None :
1442
+ """Reference data from CUDA array interface."""
1442
1443
interface = data .__cuda_array_interface__
1443
1444
interface_str = bytes (json .dumps (interface ), "utf-8" )
1444
1445
_check_call (
1445
1446
_LIB .XGProxyDMatrixSetDataCudaArrayInterface (self .handle , interface_str )
1446
1447
)
1447
1448
1448
- def _set_data_from_cuda_columnar (self , data : DataType , cat_codes : list ) -> None :
1449
- """Set data from CUDA columnar format."""
1449
+ def _ref_data_from_cuda_columnar (self , data : DataType , cat_codes : list ) -> None :
1450
+ """Reference data from CUDA columnar format."""
1450
1451
from .data import _cudf_array_interfaces
1451
1452
1452
1453
interfaces_str = _cudf_array_interfaces (data , cat_codes )
1453
1454
_check_call (_LIB .XGProxyDMatrixSetDataCudaColumnar (self .handle , interfaces_str ))
1454
1455
1455
- def _set_data_from_array (self , data : np .ndarray ) -> None :
1456
- """Set data from numpy array."""
1456
+ def _ref_data_from_array (self , data : np .ndarray ) -> None :
1457
+ """Reference data from numpy array."""
1457
1458
from .data import _array_interface
1458
1459
1459
1460
_check_call (
1460
1461
_LIB .XGProxyDMatrixSetDataDense (self .handle , _array_interface (data ))
1461
1462
)
1462
1463
1463
- def _set_data_from_pandas (self , data : DataType ) -> None :
1464
- """Set data from a pandas DataFrame. The input is a PandasTransformed instance."""
1464
+ def _ref_data_from_pandas (self , data : DataType ) -> None :
1465
+ """Reference data from a pandas DataFrame. The input is a PandasTransformed instance."""
1465
1466
_check_call (
1466
1467
_LIB .XGProxyDMatrixSetDataColumnar (self .handle , data .array_interface ())
1467
1468
)
1468
1469
1469
- def _set_data_from_csr (self , csr : scipy .sparse .csr_matrix ) -> None :
1470
- """Set data from scipy csr"""
1470
+ def _ref_data_from_csr (self , csr : scipy .sparse .csr_matrix ) -> None :
1471
+ """Reference data from scipy csr."""
1471
1472
from .data import _array_interface
1472
1473
1473
1474
_LIB .XGProxyDMatrixSetDataCSR (
@@ -1609,7 +1610,7 @@ def _init(
1609
1610
it = SingleBatchInternalIter (data = data , ** meta )
1610
1611
1611
1612
handle = ctypes .c_void_p ()
1612
- reset_callback , next_callback = it .get_callbacks (True , enable_categorical )
1613
+ reset_callback , next_callback = it .get_callbacks (enable_categorical )
1613
1614
if it .cache_prefix is not None :
1614
1615
raise ValueError (
1615
1616
"QuantileDMatrix doesn't cache data, remove the cache_prefix "
0 commit comments