Skip to content

Commit 072c846

Browse files
authored
Merge pull request #25 from DataboyUsen/main
MF formal pull request 1.2
2 parents 7d4a92b + 958655f commit 072c846

File tree

6 files changed

+56
-37
lines changed

6 files changed

+56
-37
lines changed

doc/source/examples/MF.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,14 +82,14 @@
8282
"source": [
8383
"## simulate data\n",
8484
"import numpy as np\n",
85-
"from rehline import plqMF_Ridge, make_ratings\n",
85+
"from rehline import plqMF_Ridge, make_mf_dataset\n",
8686
"from sklearn.model_selection import train_test_split\n",
8787
"from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
8888
"import matplotlib.pyplot as plt\n",
8989
"from sklearn.metrics import confusion_matrix\n",
9090
"\n",
9191
"user_num, item_num = 1200, 4000 \n",
92-
"ratings = make_ratings(n_users=user_num, n_items=item_num, n_interactions=50000, seed=42) \n",
92+
"ratings = make_mf_dataset(n_users=user_num, n_items=item_num, n_interactions=50000, seed=42) \n",
9393
"X_train, X_test, y_train, y_test = train_test_split(ratings['X'], ratings['y'], test_size=0.3, random_state=42)"
9494
]
9595
},

doc/source/tutorials/ReHLine_MF.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,16 +90,16 @@ Basic Usage
9090
9191
# 1. Necessary Packages
9292
import numpy as np
93-
from rehline import plqMF_Ridge, make_ratings
93+
from rehline import plqMF_Ridge, make_mf_dataset
9494
from sklearn.model_selection import train_test_split
9595
from sklearn.metrics import mean_absolute_error
9696
9797
9898
# 2. Data Preparation
9999
# Generate synthetic data (replace with your own data in practice)
100100
user_num, item_num = 1200, 4000
101-
ratings = make_ratings(n_users=user_num, n_items=item_num,
102-
n_interactions=50000, seed=42)
101+
ratings = make_mf_dataset(n_users=user_num, n_items=item_num,
102+
n_interactions=50000, seed=42)
103103
104104
# Split into training and testing sets
105105
X_train, X_test, y_train, y_test = train_test_split(

rehline/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from ._path_sol import plqERM_Ridge_path_sol
77
from ._sklearn_mixin import plq_Ridge_Classifier, plq_Ridge_Regressor
88
from ._mf_class import plqMF_Ridge
9-
from ._data import make_ratings
9+
from ._data import make_mf_dataset
1010

1111
__all__ = ("_BaseReHLine",
1212
"ReHLine_solver",
@@ -19,4 +19,4 @@
1919
"plq_Ridge_Regressor",
2020
"_make_loss_rehline_param",
2121
"_make_constraint_rehline_param",
22-
"make_ratings")
22+
"make_mf_dataset")

rehline/_data.py

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -42,62 +42,80 @@ def make_fair_classification(n_samples=100, n_features=5, ind_sensitive=0):
4242
return X, y, X_sen
4343

4444

45-
def make_ratings(n_users, n_items, n_factors=20,
46-
n_interactions=None, density=0.01,
47-
noise_std=0.1, seed=None,
48-
rating_min=1.0, rating_max=5.0, return_params=True):
45+
def make_mf_dataset(n_users, n_items, n_factors=20,
46+
n_interactions=None, density=0.01,
47+
noise_std=0.1, seed=None,
48+
rating_min=1.0, rating_max=5.0, return_params=True):
4949
"""
5050
Generate synthetic rating data using a matrix factorization model.
51-
51+
5252
Creates synthetic user-item rating data based on the matrix factorization
5353
approach commonly used in recommender systems. The ratings are generated
5454
as: rating = mu + user_bias + item_bias + dot(user_factor, item_factor) + noise
55-
55+
5656
Parameters
5757
----------
5858
n_users : int
5959
Number of users in the synthetic dataset
60+
6061
n_items : int
6162
Number of items in the synthetic dataset
63+
6264
n_factors : int, default=20
6365
Number of latent factors for user and item embeddings
66+
6467
n_interactions : int, optional
6568
Exact number of user-item interactions. If None, calculated as density * total_pairs
69+
6670
density : float, default=0.01
6771
Density of the rating matrix (ignored if n_interactions is specified)
72+
6873
noise_std : float, default=0.1
6974
Standard deviation of Gaussian noise added to ratings
75+
7076
seed : int, optional
7177
Random seed for reproducible results
78+
7279
rating_min : float, default=1.0
7380
Minimum possible rating value
81+
7482
rating_max : float, default=5.0
7583
Maximum possible rating value
84+
7685
return_params : bool, default=True
7786
If True, returns the underlying model parameters (P, Q, bu, bi, mu)
78-
87+
7988
Returns
8089
-------
8190
dict
8291
Dictionary containing:
83-
- 'X' : ndarray of shape (n_interactions, 2)
92+
93+
- **X** : ndarray of shape (n_interactions, 2)
8494
User-item pairs where X[:, 0] are user indices and X[:, 1] are item indices
85-
- 'y' : ndarray of shape (n_interactions,)
95+
- **y** : ndarray of shape (n_interactions,)
8696
Synthetic ratings for each user-item pair
87-
- 'params' : dict, optional
97+
- **params** : dict, optional
8898
Only returned if return_params=True. Contains:
89-
* 'P' : ndarray of shape (n_users, n_factors) - User factor matrix
90-
* 'Q' : ndarray of shape (n_items, n_factors) - Item factor matrix
91-
* 'bu' : ndarray of shape (n_users,) - User biases
92-
* 'bi' : ndarray of shape (n_items,) - Item biases
93-
* 'mu' : float - Global mean rating
94-
99+
100+
* **P** : ndarray of shape (n_users, n_factors)
101+
User factor matrix
102+
* **Q** : ndarray of shape (n_items, n_factors)
103+
Item factor matrix
104+
* **bu** : ndarray of shape (n_users,)
105+
User biases
106+
* **bi** : ndarray of shape (n_items,)
107+
Item biases
108+
* **mu** : float
109+
Global mean rating
110+
95111
Notes
96112
-----
97113
The rating generation follows the standard matrix factorization model:
114+
98115
r_ui = μ + b_u + b_i + p_u · q_i^T + ε
99-
where ε ~ N(0, noise_std²)
100-
116+
117+
where ε ~ N(0, noise_std²)
118+
101119
The generated ratings are clipped to stay within [rating_min, rating_max] range.
102120
"""
103121
rng = np.random.RandomState(seed)

rehline/_sklearn_mixin.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,13 @@ class plq_Ridge_Classifier(plqERM_Ridge, ClassifierMixin):
8484
8585
Attributes
8686
----------
87-
coef_ : ndarray of shape (n_features,)
87+
coef\_ : ndarray of shape (n_features,)
8888
Coefficients excluding the intercept.
8989
90-
intercept_ : float
90+
intercept\_ : float
9191
Intercept term. 0.0 if ``fit_intercept=False``.
9292
93-
classes_ : ndarray of shape (2,)
93+
classes\_ : ndarray of shape (2,)
9494
Unique class labels in the original label space.
9595
9696
_label_encoder : LabelEncoder
@@ -293,6 +293,7 @@ class plq_Ridge_Regressor(plqERM_Ridge, RegressorMixin):
293293
- ``{'name': 'nonnegative'}`` or ``{'name': '>=0'}``
294294
- ``{'name': 'fair', 'sen_idx': list[int], 'tol_sen': list[float]}``
295295
- ``{'name': 'custom', 'A': ndarray[K, d], 'b': ndarray[K]}``
296+
296297
Note: when ``fit_intercept=True``, a constant column is appended **as the last column**;
297298
since you index sensitive columns by ``sen_idx`` on the *original* features, indices stay valid.
298299
C : float, default=1.0
@@ -322,11 +323,11 @@ class plq_Ridge_Regressor(plqERM_Ridge, RegressorMixin):
322323
323324
Attributes
324325
----------
325-
coef_ : ndarray of shape (n_features,)
326+
coef\_ : ndarray of shape (n_features,)
326327
Learned linear coefficients (excluding the intercept term).
327-
intercept_ : float
328+
intercept\_ : float
328329
Intercept term extracted from the last coefficient when ``fit_intercept=True``, otherwise 0.0.
329-
n_features_in_ : int
330+
n_features_in\_ : int
330331
Number of input features seen during :meth:`fit` (before intercept augmentation).
331332
332333
Notes
@@ -389,9 +390,9 @@ def fit(self, X, y, sample_weight=None):
389390
X : ndarray of shape (n_samples, n_features)
390391
Training design matrix (dense). Sparse inputs are not supported.
391392
y : ndarray of shape (n_samples,)
392-
Target values.
393+
Target values.
393394
sample_weight : ndarray of shape (n_samples,), default=None
394-
Optional per-sample weights; forwarded to the underlying solver.
395+
Optional per-sample weights; forwarded to the underlying solver.
395396
396397
Returns
397398
-------
@@ -422,7 +423,7 @@ def fit(self, X, y, sample_weight=None):
422423
return self
423424

424425
def decision_function(self, X):
425-
"""Compute f(X) = X @ coef_ + intercept_.
426+
"""Compute ``f(X) = X @ coef_ + intercept_``.
426427
427428
Parameters
428429
----------

tests/_test_mf.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
'''Test MF on simulated dataset'''
22
import numpy as np
3-
from rehline import plqMF_Ridge, make_ratings
3+
from rehline import plqMF_Ridge, make_mf_dataset
44
from sklearn.model_selection import train_test_split
55
from sklearn.metrics import accuracy_score
66
from joblib import Parallel, delayed
@@ -9,7 +9,7 @@
99

1010
## Data Preparation
1111
user_num, item_num = 1200, 4000
12-
ratings = make_ratings(n_users=user_num, n_items=item_num, n_interactions=50000, seed=42)
12+
ratings = make_mf_dataset(n_users=user_num, n_items=item_num, n_interactions=50000, seed=42)
1313
X_train, X_test, y_train, y_test = train_test_split(ratings['X'], ratings['y'], test_size=0.3, random_state=42)
1414
n_jobs = -1
1515
verbose = 0
@@ -197,4 +197,4 @@ def evaluate_single_params(params):
197197
for param, value in best_params.items():
198198
print(f" {param:12}: {value}")
199199
print(f"\nBest Validation Accuracy: {best_acc:.4f}")
200-
print("="*50)
200+
print("="*50)

0 commit comments

Comments
 (0)