20 | 20 | from time import time |
21 | 21 | import numpy as np |
22 | 22 | from scipy.sparse import issparse |
| 23 | +import daal4py |
| 24 | +from daal4py.sklearn._utils import daal_check_version, sklearn_check_version |
23 | 25 |
24 | 26 | from sklearn.manifold import TSNE as BaseTSNE |
25 | 27 | from sklearn.decomposition import PCA |
|
28 | 30 | from sklearn.utils import check_random_state, check_array |
29 | 31 |
30 | 32 | from ..neighbors import NearestNeighbors |
31 | | -from .._utils import sklearn_check_version |
32 | 33 | from .._device_offload import support_usm_ndarray |
33 | 34 |
34 | 35 | if sklearn_check_version('0.22'): |
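The imports above are the crux of the patch: daal4py supplies the native gradient descent kernel, and both version helpers now come from daal4py.sklearn._utils (previously sklearn_check_version was a relative import from .._utils, removed below). The helpers return booleans and gate the code paths that follow. A minimal sketch of the gating pattern, reusing the exact version arguments that appear in this diff:

    import daal4py
    from daal4py.sklearn._utils import daal_check_version, sklearn_check_version

    if sklearn_check_version('0.22'):
        pass  # follow the scikit-learn >= 0.22 API

    if daal_check_version((2021, 'P', 600)):
        pass  # oneDAL build is new enough to ship daal_tsne_gradient_descent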
@@ -88,6 +89,48 @@ def fit(self, X, y=None): |
88 | 89 | """ |
89 | 90 | return super().fit(X, y) |
90 | 91 |
| 92 | + def _daal_tsne(self, P, n_samples, X_embedded): |
| 93 | + """Run t-SNE gradient descent with the daal4py backend.""" |
| 94 | + # t-SNE minimizes the Kullback-Leibler divergence of the Gaussians P |
| 95 | + # and the Student's t-distributions Q. The optimization algorithm that |
| 96 | + # we use is batch gradient descent with two stages: |
| 97 | + # * initial optimization with early exaggeration and momentum at 0.5 |
| 98 | + # * final optimization with momentum at 0.8 |
| 99 | + |
| 100 | + # N, nnz, n_iter_without_progress, n_iter |
| 101 | + size_iter = np.array([[n_samples], [P.nnz], [self.n_iter_without_progress], |
| 102 | + [self.n_iter]], dtype=P.dtype) |
| 103 | + params = np.array([[self.early_exaggeration], [self._learning_rate], |
| 104 | + [self.min_grad_norm], [self.angle]], dtype=P.dtype) |
| 105 | + results = np.zeros((3, 1), dtype=P.dtype) # curIter, error, gradNorm |
| 106 | + |
| 107 | + if P.dtype == np.float64: |
| 108 | + daal4py.daal_tsne_gradient_descent( |
| 109 | + X_embedded, |
| 110 | + P, |
| 111 | + size_iter, |
| 112 | + params, |
| 113 | + results, |
| 114 | + 0) |
| 115 | + elif P.dtype == np.float32: |
| 116 | + daal4py.daal_tsne_gradient_descent( |
| 117 | + X_embedded, |
| 118 | + P, |
| 119 | + size_iter, |
| 120 | + params, |
| 121 | + results, |
| 122 | + 1) |
| 123 | + else: |
| 124 | + raise ValueError("unsupported dtype of 'P' matrix") |
| 125 | + |
| 126 | + # Save the final number of iterations |
| 127 | + self.n_iter_ = int(results[0][0]) |
| 128 | + |
| 129 | + # Save the final Kullback-Leibler divergence |
| 130 | + self.kl_divergence_ = results[1][0] |
| 131 | + |
| 132 | + return X_embedded |
| 133 | + |
91 | 134 | def _fit(self, X, skip_num_points=0): |
92 | 135 | """Private function to fit the model using X as training data.""" |
93 | 136 | if isinstance(self.init, str) and self.init == 'warn': |
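The new method hands everything to daal4py.daal_tsne_gradient_descent as flat numeric buffers: size_iter carries (N, nnz, n_iter_without_progress, n_iter), params carries (early_exaggeration, learning_rate, min_grad_norm, angle), and the kernel updates X_embedded in place while writing (curIter, error, gradNorm) back into results; the trailing 0/1 flag selects the float64 or float32 kernel. A sketch of how the buffers are packed and read back, where only the shapes and ordering come from the diff and the numeric values are placeholders:

    import numpy as np

    n_samples, nnz = 1000, 90000  # placeholder sizes
    dtype = np.float64

    # N, nnz, n_iter_without_progress, n_iter
    size_iter = np.array([[n_samples], [nnz], [300], [1000]], dtype=dtype)
    # early_exaggeration, learning_rate, min_grad_norm, angle
    params = np.array([[12.0], [200.0], [1e-7], [0.5]], dtype=dtype)
    # curIter, error, gradNorm -- written in place by the kernel
    results = np.zeros((3, 1), dtype=dtype)

    # daal4py.daal_tsne_gradient_descent(X_embedded, P, size_iter, params,
    #                                    results, 0)  # 0 = float64, 1 = float32
    # After the call, the estimator attributes are read back as:
    n_iter_ = int(results[0][0])    # iterations actually performed
    kl_divergence_ = results[1][0]  # final Kullback-Leibler divergence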
@@ -293,6 +336,16 @@ def _fit(self, X, skip_num_points=0): |
293 | 336 | # Laurens van der Maaten, 2009. |
294 | 337 | degrees_of_freedom = max(self.n_components - 1, 1) |
295 | 338 |
| 339 | + daal_ready = self.method == 'barnes_hut' and self.n_components == 2 and \ |
| 340 | + self.verbose == 0 and daal_check_version((2021, 'P', 600)) |
| 341 | + |
| 342 | + if daal_ready: |
| 343 | + X_embedded = check_array(X_embedded, dtype=[np.float32, np.float64]) |
| 344 | + return self._daal_tsne( |
| 345 | + P, |
| 346 | + n_samples, |
| 347 | + X_embedded=X_embedded |
| 348 | + ) |
296 | 349 | return self._tsne( |
297 | 350 | P, |
298 | 351 | degrees_of_freedom, |
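The daal_ready guard means the oneDAL kernel covers exactly scikit-learn's default t-SNE configuration: barnes_hut, two components, no verbose output, float32/float64 data, and a recent enough oneDAL build; anything else falls through to the stock _tsne path. A usage sketch, assuming the patched class is importable as daal4py.sklearn.manifold.TSNE (an import path inferred from the relative imports at the top of this file, not stated in the diff):

    import numpy as np
    from daal4py.sklearn.manifold import TSNE  # assumed import path

    X = np.random.RandomState(0).rand(500, 20).astype(np.float32)
    # The defaults already satisfy the daal_ready check above.
    tsne = TSNE(n_components=2, method='barnes_hut', verbose=0)
    embedding = tsne.fit_transform(X)
    print(embedding.shape, tsne.n_iter_, tsne.kl_divergence_)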