
Commit df270a3

Fix OOB scores in preview Random Forest; preview algorithms fixes (#1209)
1 parent e5dd28b commit df270a3

File tree: 7 files changed (+123 / -33 lines)
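For reference, a minimal usage sketch of what this commit fixes: with OOB scoring enabled, the preview classifier's `oob_score_` should now report accuracy rather than the raw OOB error. The preview import path and the synthetic data are assumptions for illustration.

```python
from sklearn.datasets import make_classification

from sklearnex.preview.ensemble import RandomForestClassifier  # preview path, assumed

X, y = make_classification(n_samples=500, n_features=8, random_state=0)

clf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=0)
clf.fit(X, y)

# With this commit the reported value is the OOB accuracy (oneDAL's
# out_of_bag_error_accuracy), matching stock scikit-learn's semantics.
print(clf.oob_score_)
```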

onedal/datatypes/_data_conversion.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -14,12 +14,14 @@
 # limitations under the License.
 # ===============================================================================

+import numpy as np
 import warnings
 import numpy as np

 from onedal import _is_dpc_backend
 from onedal import _backend
 from daal4py.sklearn._utils import make2d
+from onedal import _is_dpc_backend

 try:
     import dpctl
```

onedal/decomposition/pca.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -49,6 +49,10 @@ def fit(self, X, y, queue):

         policy = _get_policy(queue, X, y)

+        # TODO: investigate why np.ndarray with OWNDATA=FALSE flag
+        # fails to be converted to oneDAL table
+        if isinstance(X, np.ndarray) and not X.flags['OWNDATA']:
+            X = X.copy()
         X, y = _convert_to_supported(policy, X, y)
         params = self.get_onedal_params(X)
         cov_result = _backend.covariance.compute(
```
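The copy guard above targets arrays that do not own their buffer (typically views over another array). A small standalone NumPy illustration of the condition being checked, independent of oneDAL:

```python
import numpy as np

a = np.zeros((4, 3), dtype=np.float64)
view = a[1:]                      # a slice is a view; it does not own its buffer
print(view.flags['OWNDATA'])      # False

# Same guard the commit adds before converting to a oneDAL table:
X = view
if isinstance(X, np.ndarray) and not X.flags['OWNDATA']:
    X = X.copy()                  # private, owning copy
print(X.flags['OWNDATA'])         # True
```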

onedal/ensemble/forest.cpp

Lines changed: 17 additions & 0 deletions

```diff
@@ -17,6 +17,7 @@
 #include "oneapi/dal/algo/decision_forest.hpp"

 #include "onedal/common.hpp"
+#include "onedal/version.hpp"

 namespace py = pybind11;

@@ -73,6 +74,16 @@ auto get_error_metric_mode(const py::dict& params) {
            result_mode |= error_metric_mode::out_of_bag_error;
        else if (modes[i] == "out_of_bag_error_per_observation")
            result_mode |= error_metric_mode::out_of_bag_error_per_observation;
+#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230101
+       else if (modes[i] == "out_of_bag_error_accuracy")
+           result_mode |= error_metric_mode::out_of_bag_error_accuracy;
+       else if (modes[i] == "out_of_bag_error_r2")
+           result_mode |= error_metric_mode::out_of_bag_error_r2;
+       else if (modes[i] == "out_of_bag_error_decision_function")
+           result_mode |= error_metric_mode::out_of_bag_error_decision_function;
+       else if (modes[i] == "out_of_bag_error_prediction")
+           result_mode |= error_metric_mode::out_of_bag_error_prediction;
+#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20230101
        else
            ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(mode);
    }

@@ -238,6 +249,12 @@ void init_train_result(py::module_& m) {
        .DEF_ONEDAL_PY_PROPERTY(model, result_t)
        .DEF_ONEDAL_PY_PROPERTY(oob_err, result_t)
        .DEF_ONEDAL_PY_PROPERTY(oob_err_per_observation, result_t)
+#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230101
+       .DEF_ONEDAL_PY_PROPERTY(oob_err_accuracy, result_t)
+       .DEF_ONEDAL_PY_PROPERTY(oob_err_r2, result_t)
+       .DEF_ONEDAL_PY_PROPERTY(oob_err_decision_function, result_t)
+       .DEF_ONEDAL_PY_PROPERTY(oob_err_prediction, result_t)
+#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20230101
        .DEF_ONEDAL_PY_PROPERTY(var_importance, result_t);
 }
```
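The Python layer (see forest.py below) requests these metrics as a '|'-joined string such as 'out_of_bag_error_accuracy|out_of_bag_error_decision_function', and this binding ORs the matching enum flags together. A rough Python model of that dispatch; the flag values are invented for illustration and do not correspond to oneDAL's real enum:

```python
from enum import IntFlag


class ErrorMetricMode(IntFlag):
    # Illustrative stand-ins for oneDAL's error_metric_mode bits (values invented).
    NONE = 0
    OUT_OF_BAG_ERROR = 1
    OUT_OF_BAG_ERROR_PER_OBSERVATION = 2
    OUT_OF_BAG_ERROR_ACCURACY = 4
    OUT_OF_BAG_ERROR_R2 = 8
    OUT_OF_BAG_ERROR_DECISION_FUNCTION = 16
    OUT_OF_BAG_ERROR_PREDICTION = 32


def get_error_metric_mode(modes: str) -> ErrorMetricMode:
    """OR together the flags named in a '|'-joined mode string ('none' requests nothing)."""
    result = ErrorMetricMode.NONE
    for mode in modes.split('|'):
        if mode == 'none':
            continue
        try:
            result |= ErrorMetricMode[mode.upper()]
        except KeyError:
            raise ValueError(f"Invalid error metric mode: {mode}") from None
    return result


# The classifier path in this commit requests accuracy plus the decision function:
print(get_error_metric_mode(
    'out_of_bag_error_accuracy|out_of_bag_error_decision_function'))
```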

onedal/ensemble/forest.py

Lines changed: 9 additions & 4 deletions

```diff
@@ -302,7 +302,7 @@ def _validate_targets(self, y, dtype):
         return _column_or_1d(y, warn=True).astype(dtype, copy=False)

     def _get_sample_weight(self, X, y, sample_weight):
-        n_samples, _ = X.shape
+        n_samples = X.shape[0]
         dtype = X.dtype
         if n_samples == 1:
             raise ValueError("n_samples=1")

@@ -359,9 +359,14 @@ def _fit(self, X, y, sample_weight, module, queue):
         self._onedal_model = train_result.model

         if self.oob_score:
-            self.oob_score_ = from_table(train_result.oob_err)[0, 0]
-            self.oob_prediction_ = from_table(
-                train_result.oob_err_per_observation)
+            if self.is_classification:
+                self.oob_score_ = from_table(train_result.oob_err_accuracy)[0, 0]
+                self.oob_prediction_ = from_table(
+                    train_result.oob_err_decision_function)
+            else:
+                self.oob_score_ = from_table(train_result.oob_err_r2)[0, 0]
+                self.oob_prediction_ = from_table(
+                    train_result.oob_err_prediction).reshape(-1)
             if np.any(self.oob_prediction_ == 0):
                 warnings.warn(
                     "Some inputs do not have OOB scores. This probably means "
```

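The split matters because scikit-learn defines `oob_score_` as accuracy for classifiers and R² for regressors, which is what the new oob_err_accuracy and oob_err_r2 tables are expected to carry. A short reference computation of those two quantities with plain scikit-learn metrics (synthetic values, not the oneDAL code path):

```python
import numpy as np
from sklearn.metrics import accuracy_score, r2_score

# Hypothetical per-sample OOB predictions, just to show the two score definitions.
y_cls_true = np.array([0, 1, 1, 0, 1])
y_cls_oob = np.array([0, 1, 0, 0, 1])
print(accuracy_score(y_cls_true, y_cls_oob))   # 0.8 -> what a classifier's oob_score_ reports

y_reg_true = np.array([1.0, 2.0, 3.0, 4.0])
y_reg_oob = np.array([1.1, 1.9, 3.2, 3.8])
print(r2_score(y_reg_true, y_reg_oob))         # R^2 -> what a regressor's oob_score_ reports
```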
sklearnex/preview/decomposition/pca.py

Lines changed: 20 additions & 6 deletions

```diff
@@ -28,6 +28,8 @@
 from sklearn.utils.validation import check_array
 from sklearn.base import BaseEstimator
 from sklearn.utils.validation import check_is_fitted
+if sklearn_check_version('1.1') and not sklearn_check_version('1.2'):
+    from sklearn.utils import check_scalar
 if sklearn_check_version('0.23'):
     from sklearn.decomposition._pca import _infer_dimension
 else:

@@ -38,6 +40,9 @@


 class PCA(sklearn_PCA):
+    if sklearn_check_version('1.2'):
+        _parameter_constraints: dict = {**sklearn_PCA._parameter_constraints}
+
     def __init__(
         self,
         n_components=None,

@@ -83,6 +88,15 @@ def _validate_n_components(self, n_components, n_samples,
                 % (n_components, type(n_components)))

     def fit(self, X, y=None):
+        if sklearn_check_version('1.2'):
+            self._validate_params()
+        elif sklearn_check_version('1.1'):
+            check_scalar(
+                self.n_oversamples,
+                "n_oversamples",
+                min_val=1,
+                target_type=numbers.Integral,
+            )
         self._fit(X)
         return self

@@ -93,12 +107,12 @@ def _fit(self, X):
                 "TruncatedSVD for a possible alternative."
             )

-        X = _check_array(
-            X,
-            dtype=[np.float64, np.float32],
-            ensure_2d=True,
-            copy=False
-        )
+        if sklearn_check_version('0.23'):
+            X = self._validate_data(X, dtype=[np.float64, np.float32],
+                                    ensure_2d=True, copy=False)
+        else:
+            X = _check_array(X, dtype=[np.float64, np.float32],
+                             ensure_2d=True, copy=False)

         n_samples, n_features = X.shape
         n_sf_min = min(n_samples, n_features)
```
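The fit() prologue follows the usual version-gated validation pattern: declarative `_validate_params()` on scikit-learn >= 1.2, `check_scalar` on 1.1. A condensed, hypothetical sketch of that pattern outside the estimator (the helper name is a placeholder; the n_oversamples check is taken from the diff):

```python
import numbers

from daal4py.sklearn._utils import sklearn_check_version


def validate_pca_params(estimator):
    """Hypothetical helper mirroring the version-gated checks added to fit()."""
    if sklearn_check_version('1.2'):
        # scikit-learn >= 1.2: declarative _parameter_constraints + _validate_params()
        estimator._validate_params()
    elif sklearn_check_version('1.1'):
        # scikit-learn 1.1.x: only scalar helpers such as check_scalar are available
        from sklearn.utils import check_scalar
        check_scalar(estimator.n_oversamples, "n_oversamples",
                     min_val=1, target_type=numbers.Integral)
    # older versions fall back to the estimator's own _validate_n_components()
```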

sklearnex/preview/ensemble/forest.py

Lines changed: 55 additions & 22 deletions

```diff
@@ -39,7 +39,8 @@
 from sklearn.utils.validation import (
     check_is_fitted,
     check_consistent_length,
-    check_array)
+    check_array,
+    check_X_y)

 from onedal.datatypes import _check_array, _num_features, _num_samples

@@ -339,8 +340,6 @@ def _onedal_ready(self, X, y, sample_weight):
            self._validate_params()
        else:
            self._check_parameters()
-       if sample_weight is not None:
-           sample_weight = self.check_sample_weight(sample_weight, X)

        correct_sparsity = not sp.issparse(X)
        correct_ccp_alpha = self.ccp_alpha == 0.0

@@ -526,6 +525,8 @@ def _onedal_cpu_supported(self, method_name, *data):
            ready, X, y, sample_weight = self._onedal_ready(*data)
            if not ready:
                return False
+           elif sp.issparse(X):
+               return False
            elif sp.issparse(y):
                return False
            elif sp.issparse(sample_weight):

@@ -534,6 +535,8 @@ def _onedal_cpu_supported(self, method_name, *data):
                return False
            elif self.warm_start:
                return False
+           elif self.oob_score and not daal_check_version((2023, 'P', 101)):
+               return False
            elif not self.n_outputs_ == 1:
                return False
            elif hasattr(self, 'estimators_'):

@@ -563,14 +566,20 @@ def _onedal_gpu_supported(self, method_name, *data):
            ready, X, y, sample_weight = self._onedal_ready(*data)
            if not ready:
                return False
+           elif sp.issparse(X):
+               return False
            elif sp.issparse(y):
                return False
+           elif sp.issparse(sample_weight):
+               return False
            elif not sample_weight:  # `sample_weight` is not supported.
                return False
            elif not self.ccp_alpha == 0.0:
                return False
            elif self.warm_start:
                return False
+           elif self.oob_score:
+               return False
            elif not self.n_outputs_ == 1:
                return False
            elif hasattr(self, 'estimators_'):

@@ -596,9 +605,33 @@ def _onedal_gpu_supported(self, method_name, *data):
                f'Unknown method {method_name} in {self.__class__.__name__}')

     def _onedal_fit(self, X, y, sample_weight=None, queue=None):
-        X, y = make2d(np.asarray(X)), make2d(np.asarray(y))
+        if sklearn_check_version('1.2'):
+            X, y = self._validate_data(
+                X, y, multi_output=False, accept_sparse=False,
+                dtype=[np.float64, np.float32]
+            )
+        else:
+            X, y = check_X_y(
+                X, y, accept_sparse=False, dtype=[np.float64, np.float32],
+                multi_output=False
+            )

-        y = check_array(y, ensure_2d=False)
+        if sample_weight is not None:
+            sample_weight = self.check_sample_weight(sample_weight, X)
+
+        y = np.atleast_1d(y)
+        if y.ndim == 2 and y.shape[1] == 1:
+            warnings.warn(
+                "A column-vector y was passed when a 1d array was"
+                " expected. Please change the shape of y to "
+                "(n_samples,), for example using ravel().",
+                DataConversionWarning,
+                stacklevel=2,
+            )
+        if y.ndim == 1:
+            # reshape is necessary to preserve the data contiguity against vs
+            # [:, np.newaxis] that does not.
+            y = np.reshape(y, (-1, 1))

         y, expanded_class_weight = self._validate_y_class_weight(y)

@@ -620,7 +653,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None):
                "Training data only contain information about one class.")

        if self.oob_score:
-           err = 'out_of_bag_error|out_of_bag_error_per_observation'
+           err = 'out_of_bag_error_accuracy|out_of_bag_error_decision_function'
        else:
            err = 'none'

@@ -664,35 +697,35 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None):
         return self

     def _onedal_predict(self, X, queue=None):
+        X = check_array(X, dtype=[np.float32, np.float64])
+        check_is_fitted(self)
         if sklearn_check_version("1.0"):
             self._check_feature_names(X, reset=False)
-        X = check_array(
-            X,
-            accept_sparse=False,  # is not supported
-            dtype=[np.float64, np.float32]
-        )

         res = self._onedal_estimator.predict(X, queue=queue)
         return np.take(self.classes_,
                        res.ravel().astype(np.int64, casting='unsafe'))

     def _onedal_predict_proba(self, X, queue=None):
+        X = check_array(X, dtype=[np.float64, np.float32])
         check_is_fitted(self)
         if sklearn_check_version('0.23'):
             self._check_n_features(X, reset=False)
         if sklearn_check_version("1.0"):
             self._check_feature_names(X, reset=False)
-        X = check_array(
-            X,
-            accept_sparse=False,  # is not supported
-            dtype=[np.float64, np.float32]
-        )

         return self._onedal_estimator.predict_proba(X, queue=queue)


 class RandomForestRegressor(sklearn_RandomForestRegressor, BaseRandomForest):
     __doc__ = sklearn_RandomForestRegressor.__doc__

+    if sklearn_check_version('1.2'):
+        _parameter_constraints: dict = {
+            **sklearn_RandomForestRegressor._parameter_constraints,
+            "max_bins": [Interval(numbers.Integral, 2, None, closed="left")],
+            "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")]
+        }
+
     if sklearn_check_version('1.0'):
         def __init__(
             self,

@@ -862,6 +895,8 @@ def _onedal_cpu_supported(self, method_name, *data):
                return False
            elif self.warm_start:
                return False
+           elif self.oob_score and not daal_check_version((2023, 'P', 101)):
+               return False
            elif not self.n_outputs_ == 1:
                return False
            elif hasattr(self, 'estimators_'):

@@ -903,6 +938,8 @@ def _onedal_gpu_supported(self, method_name, *data):
                return False
            elif self.warm_start:
                return False
+           elif self.oob_score:
+               return False
            elif not self.n_outputs_ == 1:
                return False
            elif hasattr(self, 'estimators_'):

@@ -949,7 +986,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None):
        rs_ = check_random_state(self.random_state)

        if self.oob_score:
-           err = 'out_of_bag_error|out_of_bag_error_per_observation'
+           err = 'out_of_bag_error_r2|out_of_bag_error_prediction'
        else:
            err = 'none'

@@ -986,11 +1023,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None):
     def _onedal_predict(self, X, queue=None):
         if sklearn_check_version("1.0"):
             self._check_feature_names(X, reset=False)
-        X = check_array(
-            X,
-            accept_sparse=False,
-            dtype=[np.float64, np.float32]
-        )
+        X = self._validate_X_predict(X)
         return self._onedal_estimator.predict(X, queue=queue)

     @wrap_output_data
```
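With the regressor now requesting out_of_bag_error_r2 and out_of_bag_error_prediction, its OOB score should line up with stock scikit-learn's R² semantics. A hedged usage sketch; the preview import path is an assumption:

```python
from sklearn.datasets import make_regression

from sklearnex.preview.ensemble import RandomForestRegressor  # preview path, assumed

X, y = make_regression(n_samples=400, n_features=6, noise=0.5, random_state=0)

reg = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=0)
reg.fit(X, y)

# With this commit the score is the OOB R^2; per-sample OOB predictions are
# gathered from out_of_bag_error_prediction on the oneDAL side.
print(reg.oob_score_)
```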

sklearnex/preview/linear_model/linear.py

Lines changed: 16 additions & 1 deletion

```diff
@@ -30,7 +30,7 @@
 if sklearn_check_version('1.0') and not sklearn_check_version('1.2'):
     from sklearn.linear_model._base import _deprecate_normalize

-from sklearn.utils.validation import _deprecate_positional_args
+from sklearn.utils.validation import _deprecate_positional_args, check_X_y
 from sklearn.exceptions import NotFittedError
 from scipy.sparse import issparse

@@ -233,6 +233,20 @@ def _initialize_onedal_estimator(self):
     def _onedal_fit(self, X, y, sample_weight, queue=None):
         assert sample_weight is None

+        check_params = {
+            'X': X,
+            'y': y,
+            'dtype': [np.float64, np.float32],
+            'accept_sparse': ['csr', 'csc', 'coo'],
+            'y_numeric': True,
+            'multi_output': True,
+            'force_all_finite': False
+        }
+        if sklearn_check_version('1.2'):
+            X, y = self._validate_data(**check_params)
+        else:
+            X, y = check_X_y(**check_params)
+
         if sklearn_check_version(
                 '1.0') and not sklearn_check_version('1.2'):
             self._normalize = _deprecate_normalize(

@@ -247,6 +261,7 @@ def _onedal_fit(self, X, y, sample_weight, queue=None):
         self._save_attributes()

     def _onedal_predict(self, X, queue=None):
+        X = self._validate_data(X, accept_sparse=False, reset=False)
         if not hasattr(self, '_onedal_estimator'):
             self._initialize_onedal_estimator()
             self._onedal_estimator.coef_ = self.coef_
```
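The shared check_params dict keeps the two validation branches identical. A standalone illustration of what those settings accept, using scikit-learn's check_X_y directly rather than the estimator:

```python
import numpy as np
from scipy import sparse
from sklearn.utils.validation import check_X_y

# Same keyword set the commit routes through either validator.
check_params = {
    'dtype': [np.float64, np.float32],
    'accept_sparse': ['csr', 'csc', 'coo'],
    'y_numeric': True,
    'multi_output': True,
    'force_all_finite': False,
}

X = sparse.csr_matrix(np.array([[1, 0], [0, 2], [3, 0]], dtype=np.int64))
y = np.array([0.5, 1.5, 2.5])

X_checked, y_checked = check_X_y(X, y, **check_params)
print(X_checked.dtype, X_checked.format)   # float64 csr -- sparse input is kept, dtype coerced
```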
