Add SPMD interfaces for LogisticRegression (#1673)

avolkov-intel · web-flow · commit 130d06880b7f · 2024-01-30T18:15:44.000+01:00
* Initial commit

* Add requirements for logistic_regression_spmd, update version requirements

* Substitute BaseLogisticRegressionSPMD with BaseEstimatorSPMD

* Remove redundant import
diff --git a/examples/sklearnex/logistic_regression_spmd.py b/examples/sklearnex/logistic_regression_spmd.py
@@ -0,0 +1,91 @@
+# ==============================================================================
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from warnings import warn
+
+import dpctl
+import dpctl.tensor as dpt
+import numpy as np
+from mpi4py import MPI
+from scipy.special import expit
+from sklearn.datasets import make_classification
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+
+from sklearnex.spmd.linear_model import LogisticRegression
+
+
+def generate_X_y(par, seed):
+    """Return synthetic (X, y): X is (ns, nf) normal data, y is (ns,) int32 0/1 labels."""
+    ns, nf = par["ns"], par["nf"]
+
+    assert nf > 2  # the last 2 weights are zero, so at least one more feature is needed
+    # Fixed seed: intercept and weights are the same for every caller (all ranks).
+    np.random.seed(42)
+    intercept = np.random.normal(0, 5)
+    weights = np.hstack([np.random.normal(0, 4, nf - 2), np.zeros(2)])
+
+    # Caller-supplied seed: only the samples and the noise vary between calls.
+    np.random.seed(seed)
+    X = np.random.normal(0, 3, (ns, nf))
+    noise = np.random.normal(0, 4, ns)
+    y = (expit(X @ weights + noise + intercept) >= 0.5).astype(np.int32)
+    return X, y
+
+
+comm = MPI.COMM_WORLD
+rank = comm.Get_rank()  # also used below as the per-rank seed for data and split
+size = comm.Get_size()
+
+if dpctl.has_gpu_devices():  # must be called: the bare function object is always truthy
+    q = dpctl.SyclQueue("gpu")
+else:
+    raise RuntimeError(
+        "GPU devices unavailable. Currently, "
+        "SPMD execution mode is implemented only for this device type."
+    )
+
+params = {"ns": 100000, "nf": 8}
+# The rank seeds both the data generation and the train/test split.
+X, y = generate_X_y(params, rank)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=rank
+)
+
+# Copy every split to a USM "device" array bound to the selected queue.
+dpt_X_train, dpt_y_train, dpt_X_test, dpt_y_test = (
+    dpt.asarray(part, usm_type="device", sycl_queue=q)
+    for part in (X_train, y_train, X_test, y_test)
+)
+
+model_spmd = LogisticRegression()
+model_spmd.fit(dpt_X_train, dpt_y_train)
+y_predict = model_spmd.predict(dpt_X_test)
+
+# Copy the predictions to the host once; they are both printed and scored below.
+y_predict_host = dpt.to_numpy(y_predict)
+accuracy = accuracy_score(y_test, y_predict_host)
+
+print("Distributed LogisticRegression results:")
+print("Coefficients on rank {}:\n{}".format(rank, model_spmd.coef_))
+print("Intercept on rank {}:\n{}".format(rank, model_spmd.intercept_))
+print("Ground truth (first 5 observations on rank {}):\n{}".format(rank, y_test[:5]))
+print(
+    "Classification results (first 5 observations on rank {}):\n{}".format(
+        rank, y_predict_host[:5]
+    )
+)
+print("Accuracy for entire rank {} (2 classes): {}\n".format(rank, accuracy))
diff --git a/onedal/linear_model/logistic_regression.cpp b/onedal/linear_model/logistic_regression.cpp
@@ -241,8 +241,13 @@ ONEDAL_PY_INIT_MODULE(logistic_regression) {
     auto sub = m.def_submodule("logistic_regression");
 
 
+#if defined(ONEDAL_DATA_PARALLEL_SPMD) && defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240100
+    ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_list_spmd, task_list);
+    ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_list_spmd, task_list);
+#else
     ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_list, task_list);
     ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_list, task_list);
+#endif // defined(ONEDAL_DATA_PARALLEL_SPMD) && defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240100
 
     ONEDAL_PY_INSTANTIATE(init_model, sub, task_list);
     ONEDAL_PY_INSTANTIATE(init_train_result, sub, task_list);
diff --git a/onedal/spmd/linear_model/__init__.py b/onedal/spmd/linear_model/__init__.py
@@ -15,5 +15,6 @@
 # ==============================================================================
 
 from .linear_model import LinearRegression
+from .logistic_regression import LogisticRegression
 
-__all__ = ["LinearRegression"]
+__all__ = ["LinearRegression", "LogisticRegression"]
diff --git a/onedal/spmd/linear_model/logistic_regression.py b/onedal/spmd/linear_model/logistic_regression.py
@@ -0,0 +1,38 @@
+# ==============================================================================
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from onedal.linear_model import LogisticRegression as LogisticRegression_Batch
+
+from ..._device_offload import support_usm_ndarray
+from .._common import BaseEstimatorSPMD
+
+
+class LogisticRegression(BaseEstimatorSPMD, LogisticRegression_Batch):  # SPMD base + batch estimator
+    @support_usm_ndarray()  # NOTE(review): presumably adapts USM ndarray inputs - confirm
+    def fit(self, X, y, queue=None):
+        return super().fit(X, y, queue)  # pure delegation, arguments passed through unchanged
+
+    @support_usm_ndarray()
+    def predict(self, X, queue=None):
+        return super().predict(X, queue)  # pure delegation, arguments passed through unchanged
+
+    @support_usm_ndarray()
+    def predict_proba(self, X, queue=None):
+        return super().predict_proba(X, queue)  # pure delegation, arguments unchanged
+
+    @support_usm_ndarray()
+    def predict_log_proba(self, X, queue=None):
+        return super().predict_log_proba(X, queue)  # pure delegation, arguments unchanged
diff --git a/sklearnex/spmd/linear_model/__init__.py b/sklearnex/spmd/linear_model/__init__.py
@@ -15,5 +15,6 @@
 # ==============================================================================
 
 from .linear_model import LinearRegression
+from .logistic_regression import LogisticRegression
 
-__all__ = ["LinearRegression"]
+__all__ = ["LinearRegression", "LogisticRegression"]
diff --git a/sklearnex/spmd/linear_model/logistic_regression.py b/sklearnex/spmd/linear_model/logistic_regression.py
@@ -0,0 +1,21 @@
+# ==============================================================================
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from onedal.spmd.linear_model import LogisticRegression
+
+# TODO:
+# Currently it uses `onedal` module interface.
+# Add sklearnex dispatching.
diff --git a/tests/run_examples.py b/tests/run_examples.py
@@ -144,6 +144,7 @@ def check_library(rule):
 req_version["knn_bf_classification_spmd.py"] = (2023, "P", 100)
 req_version["knn_bf_regression_spmd.py"] = (2023, "P", 100)
 req_version["linear_regression_spmd.py"] = (2023, "P", 100)
+req_version["logistic_regression_spmd.py"] = (2024, "P", 100)
 
 req_device = defaultdict(lambda: [])
 req_device["basic_statistics_spmd.py"] = ["gpu"]
@@ -153,6 +154,7 @@ def check_library(rule):
 req_device["knn_bf_classification_spmd.py"] = ["gpu"]
 req_device["knn_bf_regression_spmd.py"] = ["gpu"]
 req_device["linear_regression_spmd.py"] = ["gpu"]
+req_device["logistic_regression_spmd.py"] = ["gpu"]
 req_device["pca_spmd.py"] = ["gpu"]
 req_device["random_forest_classifier_dpctl.py"] = ["gpu"]
 req_device["random_forest_classifier_spmd.py"] = ["gpu"]
@@ -169,6 +171,7 @@ def check_library(rule):
 req_library["knn_bf_classification_spmd.py"] = ["dpctl", "mpi4py"]
 req_library["knn_bf_regression_spmd.py"] = ["dpctl", "mpi4py"]
 req_library["linear_regression_spmd.py"] = ["dpctl", "mpi4py"]
+req_library["logistic_regression_spmd.py"] = ["dpctl", "mpi4py"]
 req_library["pca_spmd.py"] = ["dpctl", "mpi4py"]
 req_library["random_forest_classifier_dpctl.py"] = ["dpctl"]
 req_library["random_forest_classifier_spmd.py"] = ["dpctl", "mpi4py"]