scikit-learn-contrib · MatthewSZhang · Oct 17, 2024 · Oct 16, 2024 · Oct 17, 2024
diff --git a/README.rst b/README.rst
@@ -57,10 +57,27 @@ Or via conda-forge:
 Getting Started
 ---------------
 >>> from fastcan import FastCan
->>> X = [[1, 0], [0, 1]]
->>> y = [1, 0]
->>> FastCan(verbose=0).fit(X, y).get_support()
-array([ True, False])
+>>> X = [[ 0.87, -1.34,  0.31 ],
+...     [-2.79, -0.02, -0.85 ],
+...     [-1.34, -0.48, -2.55 ],
+...     [ 1.92,  1.48,  0.65 ]]
+>>> y = [0, 1, 0, 1]
+>>> selector = FastCan(n_features_to_select=2, verbose=0).fit(X, y)
+>>> selector.get_support()
+array([ True,  True, False])
+>>> selector.get_support(indices=True) # Sorted indices
+array([0, 1])
+>>> selector.indices_ # Indices in selection order
+array([1, 0], dtype=int32)
+>>> selector.scores_ # Scores for selected features in selection order
+array([0.64276838, 0.09498243])
+>>> # Here Feature 2 must be included
+>>> selector = FastCan(n_features_to_select=2, indices_include=[2], verbose=0).fit(X, y)
+>>> # We can find the feature which is useful when working with Feature 2 
+>>> selector.indices_
+array([2, 1], dtype=int32)
+>>> selector.scores_
+array([0.16632562, 0.50544788])
 
 
 Citation

diff --git a/fastcan/_cancorr_fast.pyx b/fastcan/_cancorr_fast.pyx
@@ -12,15 +12,15 @@ from sklearn.utils._typedefs cimport int32_t
 
 
 @final
-cdef unsigned int _bsum(
-    bint* x,
-    unsigned int n,
+cdef int _bsum(
+    const bint* x,
+    int n,
 ) noexcept nogil:
     """Computes the sum of the vector of bool elements.
     """
     cdef:
-        unsigned int total = 0
-        unsigned int i
+        int total = 0
+        int i
     for i in range(n):
         total += x[i]
     return total
@@ -29,7 +29,7 @@ cdef unsigned int _bsum(
 @final
 cdef int _iamax(
     int n,
-    const floating *x,
+    const floating* x,
     int incx,
 ) noexcept nogil:
     """
@@ -45,114 +45,90 @@ cdef int _iamax(
 
 @final
 cdef bint _normv(
-    floating[::1] x,            # IN/OUT
+    const floating* x,          # IN/OUT
+    int n_samples,              # IN
 ) noexcept nogil:
     """
     Vector normalization by Euclidean norm.
     x (IN) : (n_samples,) Vector.
     x (OUT) : (n_samples,) Normalized vector.
+    n_samples (IN) : number of samples.
     Return: Mask the constant vector.
     """
     cdef:
-        unsigned int n_samples = x.shape[0]
         floating x_norm
 
-    x_norm = _nrm2(n_samples, &x[0], 1)
+    x_norm = _nrm2(n_samples, x, 1)
     if x_norm == 0.0:
         return True
     x_norm = 1.0/x_norm
-    _scal(n_samples, x_norm, &x[0], 1)
+    _scal(n_samples, x_norm, x, 1)
     return False
 
 
-@final
-cdef void _normm(
-    floating[::1, :] X,     # IN/OUT
-    bint* m,            # IN/OUT
-) noexcept nogil:
-    """
-    Matrix column-wise normalization by Euclidean norm.
-    X (IN) : (n_samples, nx) Matrix.
-    X (OUT) : (n_samples, nx) Column-wise normalized matrix.
-    m (IN): (n_features,) Mask contains only false.
-    m (OUT): (n_features,) Mask the constant vectors.
-    """
-    cdef:
-        unsigned int n_samples = X.shape[0]
-        unsigned int nx = X.shape[1]
-        floating x_norm
-        unsigned int j
-
-    # X = X/norm(X)
-    for j in range(nx):
-        x_norm = _nrm2(n_samples, &X[0, j], 1)
-        if x_norm == 0.0:
-            m[j] = True
-        else:
-            x_norm = 1.0/x_norm
-            _scal(n_samples, x_norm, &X[0, j], 1)
-
-
 @final
 cdef floating _sscvm(
-    const floating[::1] w,      # IN
-    const floating[::1, :] V,   # IN
+    const floating* w,          # IN
+    const floating* V,          # IN
+    int n_samples,              # IN
+    int n_targets,              # IN
 ) noexcept nogil:
     """
     Sum of squared correlation coefficients.
     w : (n_samples,) Centred orthogonalized feature vector.
-    V : (n_samples, nv) Centred orthogonalized target matrix.
+    V : (n_samples, n_targets) Centred orthogonalized target matrix.
+    n_samples (IN) : number of samples.
+    n_targets (IN) : column number of V
     r2 : (nw,) Sum of squared correlation coefficients, where r2i means the
         coefficient of determination between wi and V.
     """
     cdef:
-        unsigned int n_samples = V.shape[0]
-        unsigned int nv = V.shape[1]
         # R : (nw * nv) R**2 contains the pairwise h-correlation or eta-cosine, where
         #     rij means the h-correlation or eta-cosine between wi and vj.
-        floating* r = <floating*> malloc(sizeof(floating) * nv)
+        floating* r = <floating*> malloc(sizeof(floating) * n_targets)
         floating r2
 
     # r = w*V (w is treated as (1, n_samples))
-    _gemm(ColMajor, NoTrans, NoTrans, 1, nv, n_samples, 1.0,
-          &w[0], 1, &V[0, 0], n_samples, 0.0, r, 1)
+    _gemm(ColMajor, NoTrans, NoTrans, 1, n_targets, n_samples, 1.0,
+          w, 1, V, n_samples, 0.0, r, 1)
     # r2 = r*r.T
 
-    r2 = _dot(nv, r, 1, r, 1)
+    r2 = _dot(n_targets, r, 1, r, 1)
 
     free(r)
     return r2
 
 
 @final
 cdef void _mgsvv(
-    const floating[::1] w,      # IN
-    floating[::1] x,            # IN/OUT
+    const floating* w,          # IN
+    const floating* x,          # IN/OUT
+    int n_samples,     # IN
 ) noexcept nogil:
     """
     Modified Gram-Schmidt process. x = x - w*w.T*x
     w : (n_samples,) Centred orthonormal selected feature vector.
     x (IN) : (n_samples,) Centred remaining feature vector.
     x (OUT) : (n_samples,) Centred remaining feature vector, which is orthogonal to w.
+    n_samples (IN) : number of samples.
     """
     cdef:
-        unsigned int n_samples = x.shape[0]
         floating r
 
     # r = w.T*x
-    r = _dot(n_samples, &w[0], 1, &x[0], 1)
+    r = _dot(n_samples, w, 1, x, 1)
     # x = x - w*r
-    _axpy(n_samples, -r, &w[0], 1, &x[0], 1)
+    _axpy(n_samples, -r, w, 1, x, 1)
 
 
 @final
 cpdef int _forward_search(
     floating[::1, :] X,               # IN/OUT
     floating[::1, :] V,               # IN
-    const unsigned int t,             # IN
-    const floating tol,               # IN
-    const unsigned int num_threads,   # IN
-    const unsigned int verbose,       # IN
+    int t,                            # IN
+    floating tol,                     # IN
+    int num_threads,                  # IN
+    int verbose,                      # IN
     int32_t[::1] indices,             # OUT
     floating[::1] scores,             # OUT
 ) except -1 nogil:
@@ -168,13 +144,14 @@ cpdef int _forward_search(
     scores: (t,) The h-correlation/eta-cosine of selected features.
     """
     cdef:
-        unsigned int n_samples = X.shape[0]
+        int n_samples = X.shape[0]
+        int n_targets = V.shape[1]
         # OpenMP (in Windows) requires signed integral for prange
-        int j, n_features = X.shape[1]
+        int n_features = X.shape[1]
         floating* r2 = <floating*> malloc(sizeof(floating) * n_features)
         bint* mask = <bint*> malloc(sizeof(bint) * n_features)
         floating g, ssc = 0.0
-        unsigned int i
+        int i, j
         int index = -1
 
     memset(&r2[0], 0, n_features * sizeof(floating))
@@ -183,38 +160,39 @@ cpdef int _forward_search(
     for i in range(t):
         if i == 0:
             # Preprocessing
-            _normm(X, mask)
+            for j in range(n_features):
+                mask[j] = _normv(&X[0, j], n_samples)
         else:
             mask[index] = True
             r2[index] = 0
             # Make X orthogonal to X[:, indices[i-1]]
             for j in prange(n_features, nogil=True, schedule="static",
                             chunksize=1, num_threads=num_threads):
                 if not mask[j]:
-                    _mgsvv(X[:, index], X[:, j])
-                    _normv(X[:, j])
+                    _mgsvv(&X[0, index], &X[0, j], n_samples)
+                    _normv(&X[0, j], n_samples)
                     # Linear dependence check
                     g = _dot(n_samples, &X[0, index], 1, &X[0, j], 1)
                     if abs(g) > tol:
                         mask[j] = True
                         r2[j] = 0
 
-            if _bsum(mask, n_features) == n_features:
+            if _bsum(&mask[0], n_features) == n_features:
                 raise RuntimeError(
                     "No candidate feature can be found to form a non-singular "
                     f"matrix with the {i} selected features."
                 )
         if indices[i] != -1:
             index = indices[i]
-            scores[i] = _sscvm(X[:, index], V)
+            scores[i] = _sscvm(&X[0, index], &V[0, 0], n_samples, n_targets)
         else:
             # Score for X
             for j in range(n_features):
                 if not mask[j]:
-                    r2[j] = _sscvm(X[:, j], V)
+                    r2[j] = _sscvm(&X[0, j], &V[0, 0], n_samples, n_targets)
 
             # Find max scores and update indices, X, mask, and scores
-            index = _iamax(n_features, r2, 1)
+            index = _iamax(n_features, &r2[0], 1)
             indices[i] = index
             scores[i] = r2[index]
 
@@ -223,6 +201,9 @@ cpdef int _forward_search(
             with gil:
                 print(f"Progress: {i+1}/{t}, SSC: {ssc:.5f}", end="\r")
 
+    if verbose == 1:
+        with gil:
+            print()
     free(r2)
     free(mask)
     return 0
diff --git a/fastcan/_fastcan.py b/fastcan/_fastcan.py
@@ -77,14 +77,10 @@ class FastCan(SelectorMixin, BaseEstimator):
     Examples
     --------
     >>> from fastcan import FastCan
-    >>> X = [[ 0.87, -1.34,  0.31 ],
-    ...     [-2.79, -0.02, -0.85 ],
-    ...     [-1.34, -0.48, -2.55 ],
-    ...     [ 1.92,  1.48,  0.65 ]]
-    >>> y = [0, 1, 0, 1]
-    >>> selector = FastCan(n_features_to_select=2, verbose=0).fit(X, y)
-    >>> selector.get_support()
-    array([ True,  True, False])
+    >>> X = [[1, 0], [0, 1]]
+    >>> y = [1, 0]
+    >>> FastCan(verbose=0).fit(X, y).get_support()
+    array([ True, False])
     """
 
     _parameter_constraints: dict = {