Merge pull request #25 from DataboyUsen/main

statmlben · web-flow · commit 072c846217f6 · 2025-11-03T21:25:10.000+08:00
MF formal pull request 1.2: rename make_ratings to make_mf_dataset and clean up docstring formatting
diff --git a/doc/source/examples/MF.ipynb b/doc/source/examples/MF.ipynb
@@ -82,14 +82,14 @@
    "source": [
     "## simulate data\n",
     "import numpy as np\n",
-    "from rehline import plqMF_Ridge, make_ratings\n",
+    "from rehline import plqMF_Ridge, make_mf_dataset\n",
     "from sklearn.model_selection import train_test_split\n",
     "from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
     "import matplotlib.pyplot as plt\n",
     "from sklearn.metrics import confusion_matrix\n",
     "\n",
     "user_num, item_num = 1200, 4000 \n",
-    "ratings = make_ratings(n_users=user_num, n_items=item_num, n_interactions=50000, seed=42) \n",
+    "ratings = make_mf_dataset(n_users=user_num, n_items=item_num, n_interactions=50000, seed=42) \n",
     "X_train, X_test, y_train, y_test = train_test_split(ratings['X'], ratings['y'], test_size=0.3, random_state=42)"
    ]
   },
diff --git a/doc/source/tutorials/ReHLine_MF.rst b/doc/source/tutorials/ReHLine_MF.rst
@@ -90,16 +90,16 @@ Basic Usage
 
    # 1. Necessary Packages
    import numpy as np
-   from rehline import plqMF_Ridge, make_ratings
+   from rehline import plqMF_Ridge, make_mf_dataset
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_absolute_error
 
 
    # 2. Data Preparation
    # Generate synthetic data (replace with your own data in practice)
    user_num, item_num = 1200, 4000 
-   ratings = make_ratings(n_users=user_num, n_items=item_num, 
-                         n_interactions=50000, seed=42)
+   ratings = make_mf_dataset(n_users=user_num, n_items=item_num, 
+                             n_interactions=50000, seed=42)
    
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
diff --git a/rehline/__init__.py b/rehline/__init__.py
@@ -6,7 +6,7 @@
 from ._path_sol import plqERM_Ridge_path_sol
 from ._sklearn_mixin import plq_Ridge_Classifier, plq_Ridge_Regressor
 from ._mf_class import plqMF_Ridge
-from ._data import make_ratings 
+from ._data import make_mf_dataset 
 
 __all__ = ("_BaseReHLine",
            "ReHLine_solver",
@@ -19,4 +19,4 @@
            "plq_Ridge_Regressor",
            "_make_loss_rehline_param",
            "_make_constraint_rehline_param",
-           "make_ratings")
+           "make_mf_dataset")
diff --git a/rehline/_data.py b/rehline/_data.py
@@ -42,62 +42,80 @@ def make_fair_classification(n_samples=100, n_features=5, ind_sensitive=0):
     return X, y, X_sen
 
 
-def make_ratings(n_users, n_items, n_factors=20,
-             n_interactions=None, density=0.01, 
-             noise_std=0.1, seed=None, 
-             rating_min=1.0, rating_max=5.0, return_params=True):
+def make_mf_dataset(n_users, n_items, n_factors=20,
+                     n_interactions=None, density=0.01, 
+                     noise_std=0.1, seed=None, 
+                     rating_min=1.0, rating_max=5.0, return_params=True):
     """
     Generate synthetic rating data using matrix factorization model.
-    
+
     Creates synthetic user-item rating data based on the matrix factorization
     approach commonly used in recommender systems. The ratings are generated
     as: rating = mu + user_bias + item_bias + user_factor * item_factor + noise
-    
+
     Parameters
     ----------
     n_users : int
         Number of users in the synthetic dataset
+
     n_items : int
         Number of items in the synthetic dataset
+
     n_factors : int, default=20
         Number of latent factors for user and item embeddings
+
     n_interactions : int, optional
         Exact number of user-item interactions. If None, calculated as density * total_pairs
+
     density : float, default=0.01
         Density of the rating matrix (ignored if n_interactions is specified)
+
     noise_std : float, default=0.1
         Standard deviation of Gaussian noise added to ratings
+
     seed : int, optional
         Random seed for reproducible results
+
     rating_min : float, default=1.0
         Minimum possible rating value
+
     rating_max : float, default=5.0
         Maximum possible rating value
+
     return_params : bool, default=True
         If True, returns the underlying model parameters (P, Q, bu, bi, mu)
-    
+
     Returns
     -------
     dict
         Dictionary containing:
-        - 'X' : ndarray of shape (n_interactions, 2)
+        
+        - **X** : ndarray of shape (n_interactions, 2)
             User-item pairs where X[:, 0] are user indices and X[:, 1] are item indices
-        - 'y' : ndarray of shape (n_interactions,)
+        - **y** : ndarray of shape (n_interactions,)
             Synthetic ratings for each user-item pair
-        - 'params' : dict, optional
+        - **params** : dict, optional
             Only returned if return_params=True. Contains:
-            * 'P' : ndarray of shape (n_users, n_factors) - User factor matrix
-            * 'Q' : ndarray of shape (n_items, n_factors) - Item factor matrix  
-            * 'bu' : ndarray of shape (n_users,) - User biases
-            * 'bi' : ndarray of shape (n_items,) - Item biases
-            * 'mu' : float - Global mean rating
-    
+            
+            * **P** : ndarray of shape (n_users, n_factors)
+                User factor matrix
+            * **Q** : ndarray of shape (n_items, n_factors)
+                Item factor matrix  
+            * **bu** : ndarray of shape (n_users,)
+                User biases
+            * **bi** : ndarray of shape (n_items,)
+                Item biases
+            * **mu** : float
+                Global mean rating
+
     Notes
     -----
     The rating generation follows the standard matrix factorization model:
+
         r_ui = μ + b_u + b_i + p_u · q_i^T + ε
-    where ε ~ N(0, noise_std²)
-    
+
+        where ε ~ N(0, noise_std²)
+
     The generated ratings are clipped to stay within [rating_min, rating_max] range.
     """
     rng = np.random.RandomState(seed)
diff --git a/rehline/_sklearn_mixin.py b/rehline/_sklearn_mixin.py
@@ -84,13 +84,13 @@ class plq_Ridge_Classifier(plqERM_Ridge, ClassifierMixin):
 
     Attributes
     ----------
-    coef_ : ndarray of shape (n_features,)
+    coef\_ : ndarray of shape (n_features,)
         Coefficients excluding the intercept.
 
-    intercept_ : float
+    intercept\_ : float
         Intercept term. 0.0 if ``fit_intercept=False``.
 
-    classes_ : ndarray of shape (2,)
+    classes\_ : ndarray of shape (2,)
         Unique class labels in the original label space.
 
     _label_encoder : LabelEncoder
@@ -293,6 +293,7 @@ class plq_Ridge_Regressor(plqERM_Ridge, RegressorMixin):
           - ``{'name': 'nonnegative'}`` or ``{'name': '>=0'}``
           - ``{'name': 'fair', 'sen_idx': list[int], 'tol_sen': list[float]}``
           - ``{'name': 'custom', 'A': ndarray[K, d], 'b': ndarray[K]}``
+          
         Note: when ``fit_intercept=True``, a constant column is appended **as the last column**;
         since you index sensitive columns by ``sen_idx`` on the *original* features, indices stay valid.
     C : float, default=1.0
@@ -322,11 +323,11 @@ class plq_Ridge_Regressor(plqERM_Ridge, RegressorMixin):
 
     Attributes
     ----------
-    coef_ : ndarray of shape (n_features,)
+    coef\_ : ndarray of shape (n_features,)
         Learned linear coefficients (excluding the intercept term).
-    intercept_ : float
+    intercept\_ : float
         Intercept term extracted from the last coefficient when ``fit_intercept=True``, otherwise 0.0.
-    n_features_in_ : int
+    n_features_in\_ : int
         Number of input features seen during :meth:`fit` (before intercept augmentation).
 
     Notes
@@ -389,9 +390,9 @@ def fit(self, X, y, sample_weight=None):
         X : ndarray of shape (n_samples, n_features)
             Training design matrix (dense). Sparse inputs are not supported.
         y : ndarray of shape (n_samples,)
-        Target values.
+            Target values.
         sample_weight : ndarray of shape (n_samples,), default=None
-        Optional per-sample weights; forwarded to the underlying solver.
+            Optional per-sample weights; forwarded to the underlying solver.
 
         Returns
         -------
@@ -422,7 +423,7 @@ def fit(self, X, y, sample_weight=None):
         return self
 
     def decision_function(self, X):
-        """Compute f(X) = X @ coef_ + intercept_.
+        """Compute f(X) = X @ coef\_ + intercept\_.
 
         Parameters
         ----------
diff --git a/tests/_test_mf.py b/tests/_test_mf.py
@@ -1,6 +1,6 @@
 '''Test MF on simulated dataset'''
 import numpy as np
-from rehline import plqMF_Ridge, make_ratings
+from rehline import plqMF_Ridge, make_mf_dataset
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
 from joblib import Parallel, delayed
@@ -9,7 +9,7 @@
 
 ## Data Preparation
 user_num, item_num = 1200, 4000
-ratings = make_ratings(n_users=user_num, n_items=item_num, n_interactions=50000, seed=42)
+ratings = make_mf_dataset(n_users=user_num, n_items=item_num, n_interactions=50000, seed=42)
 X_train, X_test, y_train, y_test = train_test_split(ratings['X'], ratings['y'], test_size=0.3, random_state=42)
 n_jobs = -1
 verbose = 0
@@ -197,4 +197,4 @@ def evaluate_single_params(params):
 for param, value in best_params.items():
     print(f"  {param:12}: {value}")
 print(f"\nBest Validation Accuracy: {best_acc:.4f}")
-print("="*50)
+print("="*50)