1 | 1 | import numpy as np # noqa: D100 |
2 | 2 | import pandas as pd |
3 | 3 |
4 | | - |
5 | | -def moy_p(V, weights): |
6 | | -    """Compute the weighted mean of a vector, ignoring NaNs. |
7 | | -
8 | | -    Parameters |
9 | | -    ---------- |
10 | | -    V : array-like |
11 | | -        Input vector with possible NaN values. |
12 | | -    weights : array-like |
13 | | -        Weights corresponding to each element in V. |
14 | | -
15 | | -    Returns |
16 | | -    ------- |
17 | | -    float |
18 | | -        Weighted mean of non-NaN elements. |
19 | | -
20 | | -    """ |
21 | | -    mask = ~np.isnan(V) |
22 | | -    total_weight = np.sum(weights[mask]) |
23 | | -    if total_weight == 0: |
24 | | -        return 0.0  # or use np.finfo(float).eps for a small positive value |
25 | | -    return np.sum(V[mask] * weights[mask]) / total_weight |
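For reference, the removed `moy_p` is just a NaN-aware weighted average; a minimal sketch of the same computation on illustrative values:

```python
import numpy as np

# Toy data (illustrative only): the NaN entry is skipped in the average.
V = np.array([1.0, np.nan, 3.0])
weights = np.array([0.5, 0.25, 0.25])
mask = ~np.isnan(V)
# (1.0 * 0.5 + 3.0 * 0.25) / (0.5 + 0.25) = 1.6667
print(np.sum(V[mask] * weights[mask]) / np.sum(weights[mask]))
```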
26 | | - |
27 | | - |
28 | | -def tab_disjonctif_NA(df): |
29 | | -    """Create a disjunctive (one-hot encoded) table. |
30 | | -
31 | | -    Parameters |
32 | | -    ---------- |
33 | | -    df : DataFrame |
34 | | -        Input DataFrame with categorical and numeric variables. |
35 | | -
36 | | -    Returns |
37 | | -    ------- |
38 | | -    DataFrame |
39 | | -        Disjunctive table with one-hot encoding. |
40 | | -
41 | | -    """  # noqa: E501 |
42 | | -    df_encoded_list = [] |
43 | | -    for col in df.columns: |
44 | | -        if df[col].dtype.name == "category" or df[col].dtype == object: |
45 | | -            df[col] = df[col].astype("category") |
46 | | -            # Include '__MISSING__' as a category if not already present |
47 | | -            if "__MISSING__" not in df[col].cat.categories: |
48 | | -                df[col] = df[col].cat.add_categories(["__MISSING__"]) |
49 | | -            # Fill missing values with '__MISSING__' |
50 | | -            df[col] = df[col].fillna("__MISSING__") |
51 | | -            # One-hot encode the categorical variable |
52 | | -            encoded = pd.get_dummies( |
53 | | -                df[col], |
54 | | -                prefix=col, |
55 | | -                prefix_sep="_", |
56 | | -                dummy_na=False, |
57 | | -                dtype=float, |
58 | | -            ) |
59 | | -            df_encoded_list.append(encoded) |
60 | | -        else: |
61 | | -            # Numeric column; keep as is |
62 | | -            df_encoded_list.append(df[[col]]) |
63 | | -    # Concatenate all encoded columns |
64 | | -    df_encoded = pd.concat(df_encoded_list, axis=1) |
65 | | -    return df_encoded |
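A sketch of the encoding `tab_disjonctif_NA` produces for a single categorical column, using toy data chosen here purely for illustration: missing values become an explicit `__MISSING__` level before one-hot encoding.

```python
import numpy as np
import pandas as pd

# Toy column (illustrative only) with one missing value.
col = pd.Series(["red", np.nan, "blue"], name="color").astype("category")
col = col.cat.add_categories(["__MISSING__"]).fillna("__MISSING__")
encoded = pd.get_dummies(col, prefix="color", prefix_sep="_", dtype=float)
print(encoded)
#    color_blue  color_red  color___MISSING__
# 0         0.0        1.0                0.0
# 1         0.0        0.0                1.0
# 2         1.0        0.0                0.0
```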
66 | | - |
67 | | - |
68 | | -def tab_disjonctif_prop(df, seed=None): |
69 | | -    """Perform probabilistic imputation for categorical columns using observed |
70 | | -    value distributions, without creating a separate missing category. |
71 | | -
72 | | -    Parameters |
73 | | -    ---------- |
74 | | -    df : DataFrame |
75 | | -        DataFrame with categorical columns to impute. |
76 | | -    seed : int, optional |
77 | | -        Random seed for reproducibility. Default is None. |
78 | | -
79 | | -    Returns |
80 | | -    ------- |
81 | | -    DataFrame |
82 | | -        Disjunctive coded DataFrame with missing values probabilistically |
83 | | -        imputed. |
84 | | -
85 | | -    """  # noqa: D205 |
86 | | -    if seed is not None: |
87 | | -        np.random.seed(seed) |
88 | | -    df = df.copy() |
89 | | -    df_encoded_list = [] |
90 | | -    for col in df.columns: |
91 | | -        if df[col].dtype.name == "category" or df[col].dtype == object: |
92 | | -            # Ensure categories are strings |
93 | | -            df[col] = df[col].cat.rename_categories( |
94 | | -                df[col].cat.categories.astype(str) |
95 | | -            ) |
96 | | -            observed = df[col][df[col].notna()] |
97 | | -            categories = df[col].cat.categories.tolist() |
98 | | -            # Get observed frequencies |
99 | | -            freqs = observed.value_counts(normalize=True) |
100 | | -            # Impute missing values based on observed frequencies |
101 | | -            missing_indices = df[col][df[col].isna()].index |
102 | | -            if len(missing_indices) > 0: |
103 | | -                imputed_values = np.random.choice( |
104 | | -                    freqs.index, size=len(missing_indices), p=freqs.values |
105 | | -                ) |
106 | | -                df.loc[missing_indices, col] = imputed_values |
107 | | -            # One-hot encode without creating missing category |
108 | | -            encoded = pd.get_dummies( |
109 | | -                df[col], |
110 | | -                prefix=col, |
111 | | -                prefix_sep="_", |
112 | | -                dummy_na=False, |
113 | | -                dtype=float, |
114 | | -            ) |
115 | | -            col_names = [f"{col}_{cat}" for cat in categories] |
116 | | -            encoded = encoded.reindex(columns=col_names, fill_value=0.0) |
117 | | -            df_encoded_list.append(encoded) |
118 | | -        else: |
119 | | -            df_encoded_list.append(df[[col]]) |
120 | | -    df_encoded = pd.concat(df_encoded_list, axis=1) |
121 | | -    return df_encoded |
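The core of the removed `tab_disjonctif_prop` is the frequency-based draw for missing entries; a minimal sketch on a toy series (values and seed are illustrative only):

```python
import numpy as np
import pandas as pd

np.random.seed(42)  # illustrative seed
s = pd.Series(["a", "a", "b", np.nan], name="var")
freqs = s.value_counts(normalize=True)  # a: 2/3, b: 1/3 (NaN excluded)
missing_idx = s[s.isna()].index
# Draw replacements for the missing entries from the observed distribution.
s.loc[missing_idx] = np.random.choice(freqs.index, size=len(missing_idx), p=freqs.values)
print(pd.get_dummies(s, prefix="var", prefix_sep="_", dtype=float))
```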
122 | | - |
123 | | - |
124 | | -def find_category(df_original, tab_disj): |
125 | | -    """Reconstruct the original categorical variables from the disjunctive table. |
126 | | -
127 | | -    Parameters |
128 | | -    ---------- |
129 | | -    df_original : DataFrame |
130 | | -        Original DataFrame with categorical variables. |
131 | | -    tab_disj : DataFrame |
132 | | -        Disjunctive table after imputation. |
133 | | -
134 | | -    Returns |
135 | | -    ------- |
136 | | -    DataFrame |
137 | | -        Reconstructed DataFrame with imputed categorical variables. |
138 | | -
139 | | -    """ |
140 | | -    df_reconstructed = df_original.copy() |
141 | | -    start_idx = 0 |
142 | | -    for col in df_original.columns: |
143 | | -        if ( |
144 | | -            df_original[col].dtype.name == "category" |
145 | | -            or df_original[col].dtype == object |
146 | | -        ):  # noqa: E501 |
147 | | -            categories = df_original[col].cat.categories.tolist() |
148 | | -            if "__MISSING__" in categories: |
149 | | -                missing_cat_index = categories.index("__MISSING__") |
150 | | -            else: |
151 | | -                missing_cat_index = None |
152 | | -            num_categories = len(categories) |
153 | | -            sub_tab = tab_disj.iloc[:, start_idx : start_idx + num_categories] |
154 | | -            if missing_cat_index is not None: |
155 | | -                sub_tab.iloc[:, missing_cat_index] = -np.inf |
156 | | -            # Find the category with the maximum value for each row |
157 | | -            max_indices = sub_tab.values.argmax(axis=1) |
158 | | -            df_reconstructed[col] = [categories[idx] for idx in max_indices] |
159 | | -            # Replace '__MISSING__' back to NaN |
160 | | -            df_reconstructed[col].replace("__MISSING__", np.nan, inplace=True) |
161 | | -            start_idx += num_categories |
162 | | -        else: |
163 | | -            # For numeric variables, keep as is |
164 | | -            start_idx += 1  # Increment start_idx by 1 for numeric columns |
165 | | -    return df_reconstructed |
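The reconstruction step in `find_category` is an argmax over each variable's block of indicator columns, with the `__MISSING__` column masked out; a small sketch with illustrative values:

```python
import numpy as np

# Imputed disjunctive block for one variable (illustrative values);
# columns follow the category order ["blue", "red", "__MISSING__"].
categories = ["blue", "red", "__MISSING__"]
sub_tab = np.array([[0.2, 0.7, 0.1],
                    [0.6, 0.3, 0.1]])
sub_tab[:, categories.index("__MISSING__")] = -np.inf  # never pick the missing level
labels = [categories[i] for i in sub_tab.argmax(axis=1)]
print(labels)  # ['red', 'blue']
```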
166 | | - |
167 | | - |
168 | | -def svdtriplet(X, row_w=None, ncp=np.inf): |
169 | | -    """Perform weighted SVD on matrix X with row weights. |
170 | | -
171 | | -    Parameters |
172 | | -    ---------- |
173 | | -    X : ndarray |
174 | | -        Data matrix of shape (n_samples, n_features). |
175 | | -    row_w : array-like, optional |
176 | | -        Row weights. If None, uniform weights are assumed. Default is None. |
177 | | -    ncp : int |
178 | | -        Number of principal components to retain. Default is infinity. |
179 | | -
180 | | -    Returns |
181 | | -    ------- |
182 | | -    s : ndarray |
183 | | -        Singular values. |
184 | | -    U : ndarray |
185 | | -        Left singular vectors. |
186 | | -    V : ndarray |
187 | | -        Right singular vectors. |
188 | | -
189 | | -    """ |
190 | | -    if not isinstance(X, np.ndarray): |
191 | | -        X = np.array(X, dtype=float) |
192 | | -    else: |
193 | | -        X = X.astype(float) |
194 | | -    if row_w is None: |
195 | | -        row_w = np.ones(X.shape[0]) / X.shape[0] |
196 | | -    else: |
197 | | -        row_w = np.array(row_w, dtype=float) |
198 | | -    row_w /= row_w.sum() |
199 | | -    ncp = int(min(ncp, X.shape[0] - 1, X.shape[1])) |
200 | | -    # Apply weights to rows |
201 | | -    X_weighted = X * np.sqrt(row_w[:, None]) |
202 | | -    # Perform SVD |
203 | | -    U, s, Vt = np.linalg.svd(X_weighted, full_matrices=False) |
204 | | -    V = Vt.T |
205 | | -    U = U[:, :ncp] |
206 | | -    V = V[:, :ncp] |
207 | | -    s = s[:ncp] |
208 | | -    # Adjust signs to ensure consistency |
209 | | -    mult = np.sign(np.sum(V, axis=0)) |
210 | | -    mult[mult == 0] = 1 |
211 | | -    U *= mult |
212 | | -    V *= mult |
213 | | -    # Rescale U by the square root of row weights |
214 | | -    U /= np.sqrt(row_w[:, None]) |
215 | | -    return s, U, V |
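A brief sketch of the weighted SVD performed by `svdtriplet`: rows are scaled by the square root of their weights before the SVD and the left vectors are rescaled back, so the returned triplet still reconstructs `X` (toy matrix and uniform weights, for illustration only):

```python
import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])  # toy data
row_w = np.ones(X.shape[0]) / X.shape[0]             # uniform row weights
U, s, Vt = np.linalg.svd(X * np.sqrt(row_w[:, None]), full_matrices=False)
U = U / np.sqrt(row_w[:, None])   # undo the row weighting on the left vectors
print(np.allclose((U * s) @ Vt, X))  # True: U @ diag(s) @ Vt recovers X
```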
| 4 | +from qolmat.utils.algebra import svdtriplet |
| 5 | +from qolmat.utils.utils import ( |
| 6 | + find_category, |
| 7 | + moy_p, |
| 8 | + tab_disjonctif_NA, |
| 9 | + tab_disjonctif_prop, |
| 10 | +) |
216 | 11 |
217 | 12 |
218 | 13 | def imputeMCA( |