Skip to content

Commit fede266

Browse files
md-shafiul-alammd.shafiul.alam
andauthored
KMeans OOP (#1770)
* kmeans oop init commit * reformat * reformat * experimental * address ci failures * deselected tests * will be reverted * enable deslected tests * include elkan * address CI failure * address ci failures * enable all deselected tests * deselected tests * compiler update * init signature * deselected tests * format * add sparsity support * lint * minor fix * callable init * lint * table fix * minor * minor * rename attribute * test, revert later * minor * add sparsity * lint * replace basic stat with numpy * remove skip * CI fixes * CI fixes * lint * minor * fix sample_weight * pandas dtype * lint * remove deselected tests * use numpy variance * test sparse offset * revert b51e6bd * remove basic_statistics changes * remove comments * minor * update * update * add result option * refactor for csr * lint * refactor and ci * add version check for oneDAL * update * fix for CI * ci fix * minor * some fixes * ci fixes * lint * add version checks * csr condition for policy * version check for stability check * update test * floating methods * minor * ci fixes * minor * address review * address review * minor * update comments * refactor * ci * address ci * update test * version check * lint * minor fix * lint * basic stat fix * score * minor * ci fix + refactor * more fixes * not a table * minor * sample weight * import * preview remove * SPMD fix * SPMD fix * SPMD fix * refactor * deselect * deselect refactor * deselect update * deselect update * deselect update * deselect * reverting to previous * update daal version * refactor deselected tests * update daal check * address comments * address comments * test fix * address comments * minor * refactor * refactor * refactor * ci fix * ci fix * minor * update checks * import * fix import * refactor * update test * update test * ci fixes * lint * minor * minor * ci fix * fix ci * fix ci * fix ci * fix ci --------- Co-authored-by: md.shafiul.alam <[email protected]>
1 parent 48714b0 commit fede266

File tree

16 files changed

+714
-615
lines changed

16 files changed

+714
-615
lines changed

deselected_tests.yaml

Lines changed: 4 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,6 @@ deselected_tests:
177177

178178
# test_non_uniform_strategies fails due to differences in handling of vacuous clusters after update
179179
# See https://github.com/IntelPython/daal4py/issues/69
180-
- cluster/tests/test_k_means.py::test_relocated_clusters >=0.23,<0.24
181180
- cluster/tests/test_k_means.py::test_kmeans_relocated_clusters >=0.24
182181

183182
# In scikit-learn, these algorithms are not included in this test. However, scikit-learn-intelex
@@ -258,9 +257,6 @@ deselected_tests:
258257
# scikit-learn-intelex and scikit-learn linear regression give different results with sample weights. Need to investigate.
259258
- inspection/tests/test_permutation_importance.py::test_permutation_importance_sample_weight >=0.24
260259

261-
# Patched and unpatched kmeans set same values to different clusters. Need to investigate.
262-
- preprocessing/tests/test_discretization.py::test_nonuniform_strategies[kmeans-expected_2bins1-expected_3bins1-expected_5bins1] >=0.24
263-
264260
# OOB scores in scikit-learn and oneDAL are different because of different random number generators
265261
- ensemble/tests/test_forest.py::test_forest_classifier_oob[X1-y1-0.65-array-ExtraTreesClassifier]
266262
- ensemble/tests/test_forest.py::test_forest_classifier_oob[True-X1-y1-0.65-array-ExtraTreesClassifier] >=1.3
@@ -362,14 +358,6 @@ deselected_tests:
362358
- tests/test_common.py::test_estimators[LogisticRegression()-check_sample_weights_invariance(kind=zeros)] >=1.4
363359
- tests/test_multioutput.py::test_classifier_chain_fit_and_predict_with_sparse_data >=1.4
364360

365-
# New failing sklearn1.4.1 tests for kmeans associated with incorrect n_iter_ values in daal4py
366-
- cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-dense] >=1.4
367-
- cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-sparse_matrix] >=1.4
368-
- cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-sparse_array] >=1.4
369-
- cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-dense] >=1.4
370-
- cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-sparse_matrix] >=1.4
371-
- cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-sparse_array] >=1.4
372-
373361
# Deselected tests for incremental algorithms
374362
# Need to rework getting policy to correctly obtain it for method without data (finalize_fit)
375363
# and avoid keeping it in class attribute, also need to investigate how to implement
@@ -466,16 +454,15 @@ public:
466454
- neighbors/tests/test_neighbors.py::test_KNeighborsClassifier_raise_on_all_zero_weights
467455

468456
# --------------------------------------------------------
469-
# The following tests currently fail with GPU offload
457+
# The following tests currently fail with GPU offloading
470458
gpu:
471-
472459
# Segfaults
473460
- ensemble/tests/test_weight_boosting.py
474-
475461
# Fails
476462
- cluster/tests/test_dbscan.py::test_weighted_dbscan
477-
- cluster/tests/test_k_means.py::test_k_means_fit_predict
478-
- cluster/tests/test_k_means.py::test_predict
463+
- cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-normal]
464+
- cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs]
465+
- model_selection/tests/test_search.py::test_unsupervised_grid_search
479466

480467
- ensemble/tests/test_bagging.py::test_gridsearch
481468
- ensemble/tests/test_bagging.py::test_estimators_samples
@@ -609,8 +596,6 @@ gpu:
609596
- tests/test_common.py::test_estimators[GaussianMixture()-check_fit_idempotent]
610597
- tests/test_common.py::test_estimators[GaussianMixture()-check_n_features_in]
611598
- tests/test_common.py::test_estimators[GaussianMixture()-check_fit2d_predict1d]
612-
- tests/test_common.py::test_estimators[KMeans()-check_clustering]
613-
- tests/test_common.py::test_estimators[KMeans()-check_clustering(readonly_memmap=True)]
614599
- tests/test_common.py::test_estimators[RandomForestClassifier()-check_class_weight_classifiers]
615600
- tests/test_common.py::test_estimators[SVC()-check_sample_weights_pandas_series]
616601
- tests/test_common.py::test_estimators[SVC()-check_sample_weights_not_an_array]
@@ -645,7 +630,6 @@ gpu:
645630
- tests/test_multiclass.py::test_ovr_coef_
646631
- tests/test_multiclass.py::test_ovr_deprecated_coef_intercept
647632
- tests/test_multiclass.py::test_pairwise_cross_val_score
648-
649633
- tests/test_multioutput.py::test_multiclass_multioutput_estimator_predict_proba
650634
- tests/test_multioutput.py::test_classifier_chain_fit_and_predict_with_sparse_data
651635

@@ -658,25 +642,6 @@ gpu:
658642
- tests/test_common.py::test_search_cv
659643
- manifold/tests/test_t_sne.py::test_n_iter_without_progress
660644

661-
# KMeans based (unsupported for GPU)
662-
- cluster/tests/test_k_means.py
663-
- tests/test_common.py::test_pandas_column_name_consistency[KMeans()]
664-
- tests/test_common.py::test_pandas_column_name_consistency[GaussianMixture()]
665-
- tests/test_common.py::test_pandas_column_name_consistency[BayesianGaussianMixture()]
666-
- tests/test_common.py::test_estimators[KMeans()
667-
- tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_fit_check_is_fitted]
668-
- tests/test_common.py::test_estimators[GaussianMixture()-check_fit_check_is_fitted]
669-
- tests/test_common.py::test_check_n_features_in_after_fitting[BayesianGaussianMixture()]
670-
- tests/test_common.py::test_check_n_features_in_after_fitting[GaussianMixture()]
671-
- tests/test_common.py::test_check_n_features_in_after_fitting[KMeans()]
672-
- tests/test_common.py::test_set_output_transform[KMeans()]
673-
- tests/test_common.py::test_set_output_transform_pandas[KMeans()]
674-
- tests/test_common.py::test_global_output_transform_pandas[KMeans()]
675-
- mixture/tests/test_gaussian_mixture.py
676-
- model_selection/tests/test_validation.py::test_cross_val_predict
677-
- metrics/tests/test_score_objects.py::test_supervised_cluster_scorers
678-
- tests/test_pipeline.py::test_fit_predict_on_pipeline
679-
- tests/test_discriminant_analysis.py::test_lda_predict
680645
# Other device issues
681646
- tests/test_metaestimators.py::test_meta_estimators_delegate_data_validation[StackingClassifier]
682647
- tests/test_multiclass.py::test_ovr_always_present
@@ -759,9 +724,3 @@ gpu:
759724
# RuntimeError: Device support is not implemented, failing as result of fallback to cpu false
760725
- svm/tests/test_svm.py::test_unfitted
761726
- tests/test_common.py::test_estimators[SVC()-check_estimators_unfitted]
762-
763-
preview:
764-
- cluster/tests/test_k_means.py::test_kmeans_elkan_results
765-
- cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2
766-
- cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2
767-
- cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3

onedal/cluster/kmeans.cpp

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ struct method2t {
3838
const auto method = params["method"].cast<std::string>();
3939
ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default);
4040
ONEDAL_PARAM_DISPATCH_VALUE(method, "lloyd_dense", ops, Float, method::lloyd_dense);
41+
#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700
42+
ONEDAL_PARAM_DISPATCH_VALUE(method, "lloyd_csr", ops, Float, method::lloyd_csr);
43+
#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700
4144
ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method);
4245
}
4346

@@ -47,14 +50,10 @@ struct method2t {
4750
template <typename Float, typename Method, typename Task>
4851
struct descriptor_creator {};
4952

50-
template <typename Float>
51-
struct descriptor_creator<Float,
52-
dal::kmeans::method::by_default,
53-
dal::kmeans::task::clustering > {
53+
template <typename Float, typename Method>
54+
struct descriptor_creator<Float, Method, dal::kmeans::task::clustering> {
5455
static auto get() {
55-
return dal::kmeans::descriptor<Float,
56-
dal::kmeans::method::by_default,
57-
dal::kmeans::task::clustering>{};
56+
return dal::kmeans::descriptor<Float, Method, dal::kmeans::task::clustering>{};
5857
}
5958
};
6059

@@ -65,10 +64,15 @@ struct params2desc {
6564

6665
auto desc = descriptor_creator<Float, Method, Task>::get();
6766

68-
desc.set_cluster_count( params["cluster_count"].cast<std::int64_t>() );
69-
desc.set_accuracy_threshold( params["accuracy_threshold"].cast<Float>() );
70-
desc.set_max_iteration_count( params["max_iteration_count"].cast<std::int64_t>() );
71-
67+
desc.set_cluster_count(params["cluster_count"].cast<std::int64_t>());
68+
desc.set_accuracy_threshold(params["accuracy_threshold"].cast<Float>());
69+
desc.set_max_iteration_count(params["max_iteration_count"].cast<std::int64_t>());
70+
#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200
71+
auto result_options = params["result_options"].cast<std::string>();
72+
if (result_options == "compute_exact_objective_function") {
73+
desc.set_result_options(result_options::compute_exact_objective_function);
74+
}
75+
#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200
7276
return desc;
7377
}
7478
};
@@ -153,7 +157,8 @@ void init_infer_result(py::module_& m) {
153157

154158
auto cls = py::class_<result_t>(m, "infer_result")
155159
.def(py::init())
156-
.DEF_ONEDAL_PY_PROPERTY(responses, result_t);
160+
.DEF_ONEDAL_PY_PROPERTY(responses, result_t)
161+
.DEF_ONEDAL_PY_PROPERTY(objective_function_value, result_t);
157162
}
158163

159164
ONEDAL_PY_DECLARE_INSTANTIATOR(init_model);
@@ -173,10 +178,10 @@ ONEDAL_PY_INIT_MODULE(kmeans) {
173178
auto sub = m.def_submodule("kmeans");
174179

175180
#ifdef ONEDAL_DATA_PARALLEL_SPMD
176-
#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200
177-
ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_spmd, task_list);
178-
ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_spmd, task_list);
179-
#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200
181+
#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200
182+
ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_spmd, task_list);
183+
ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_spmd, task_list);
184+
#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200
180185
#else // ONEDAL_DATA_PARALLEL_SPMD
181186
ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_list, task_list);
182187
ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_list, task_list);

0 commit comments

Comments
 (0)