Skip to content

Commit 0c74e5c

Browse files
author
jbiggsets
committed
fix the scores method
1 parent d147700 commit 0c74e5c

File tree

4 files changed

+94
-25
lines changed

4 files changed

+94
-25
lines changed

factor_analyzer/factor_analyzer.py

Lines changed: 72 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,14 @@ class FactorAnalyzer:
226226
rotation_matrix : np.array
227227
The rotation matrix, if a rotation
228228
has been performed.
229-
229+
structure : np.array or None
230+
The structure loading matrix.
231+
This only exists if the rotation
232+
is promax.
233+
psi : np.array or None
234+
The factor correlations
235+
matrix. This only exists
236+
if the rotation is oblique.
230237
Notes
231238
-----
232239
This code was partly derived from the excellent R package
@@ -270,6 +277,9 @@ def __init__(self,
270277
self.loadings = None
271278
self.rotation_matrix = None
272279

280+
self._scale_mean = None
281+
self._scale_std = None
282+
273283
@staticmethod
274284
def _fit_uls_objective(psi, corr_mtx, n_factors):
275285
"""
@@ -588,7 +598,7 @@ def fit_factor_analysis(self,
588598
'Check to make sure you do not have any '
589599
'features with zero standard deviation.')
590600

591-
corr = corr.values
601+
corr = corr.copy().values
592602

593603
# if `use_smc` is True, get the squared multiple correlations
594604
# and use these as initial guesses for optimizer
@@ -647,7 +657,6 @@ def analyze(self,
647657
normalize=True,
648658
impute='median',
649659
remove_non_numeric=True,
650-
use_scaling=True,
651660
use_corr_matrix=False,
652661
**kwargs):
653662
"""
@@ -706,11 +715,6 @@ def analyze(self,
706715
Remove any non-numeric data. If `use_corr_matrix` is True,
707716
no non-numeric data will be removed.
708717
Defaults to True.
709-
use_scaling : bool, optional
710-
Whether to scale the data by subtracting out the mean
711-
and dividing by the standard deviation. If `use_corr_matrix`
712-
is True, scaling will not be performed.
713-
Defaults to True.
714718
use_corr_matrix : bool, optional
715719
Set to true if the `data` is the correlation
716720
matrix.
@@ -772,16 +776,11 @@ def analyze(self,
772776
self.corr = df.copy()
773777
else:
774778
self.corr = df.corr()
775-
776-
# scale the data, if it is not a correlation
777-
# matrix and `use_scaling` is True
778-
if use_scaling and not use_corr_matrix:
779-
X = (df - df.mean(0)) / df.std(0)
780-
else:
781-
X = df.copy()
779+
self._scale_mean = df.mean(0)
780+
self._scale_std = df.std(0)
782781

783782
# fit factor analysis model
784-
loadings = self.fit_factor_analysis(X,
783+
loadings = self.fit_factor_analysis(df.copy(),
785784
n_factors,
786785
use_smc,
787786
bounds,
@@ -883,7 +882,7 @@ def get_eigenvalues(self):
883882
"""
884883
if (self.corr is not None and self.loadings is not None):
885884

886-
corr = self.corr.values
885+
corr = self.corr.copy().values
887886

888887
e_values, _ = sp.linalg.eigh(corr)
889888
e_values = pd.DataFrame(e_values[::-1],
@@ -1023,20 +1022,46 @@ def get_factor_variance(self):
10231022

10241023
return variance_info
10251024

1026-
def get_scores(self, data):
1025+
def get_scores(self,
1026+
data,
1027+
scale_mean=None,
1028+
scale_std=None):
10271029
"""
10281030
Get the factor scores, given the data.
10291031
10301032
Parameters
10311033
----------
10321034
data : pd.DataFrame
10331035
The data to calculate factor scores.
1036+
scale_mean : pd.Series, float, or None
1037+
The mean of the original
1038+
data set used to fit the
1039+
factor model. If None, attempt
1040+
to retrieve the mean from the
1041+
original `analyze()` method,
1042+
if it was saved.
1043+
Defaults to None.
1044+
scale_std : pd.Series, float, or None
1045+
The standard deviation of the original
1046+
data set used to fit the
1047+
factor model. If None, attempt
1048+
to retrieve the standard deviation from the
1049+
original `analyze()` method,
1050+
if it was saved.
1051+
Defaults to None.
10341052
10351053
Returns
10361054
-------
10371055
scores : pd.DataFrame
10381056
The factor scores.
10391057
1058+
Raises
1059+
------
1060+
ValueError
1061+
If either scale_std or scale_mean
1062+
is None, and the original mean or standard
1063+
deviation were not saved during fitting.
1064+
10401065
Examples
10411066
--------
10421067
>>> import pandas as pd
@@ -1055,11 +1080,38 @@ def get_scores(self, data):
10551080
if self.loadings is not None:
10561081

10571082
df = data.copy()
1058-
corr = data.corr()
1083+
corr = self.corr.copy()
1084+
1085+
error_msg = ('The `{}` argument is None, but no scaled {} '
1086+
'was saved when fitting your original factor '
1087+
'model. This most likely because you used a '
1088+
'correlation matrix, rather than the full data '
1089+
'set. Please either pass a value for `{}` '
1090+
'or re-fit your model using the full data set.')
1091+
1092+
# if no scaled mean is passed, use the mean from the
1093+
# original fitting procedure; if none was saved, raise an error
1094+
if scale_mean is None and self._scale_mean is not None:
1095+
scale_mean = self._scale_mean
1096+
elif scale_mean is None and self._scale_mean is None:
1097+
raise ValueError(error_msg.format('scale_mean',
1098+
'mean',
1099+
'scale_mean'))
1100+
1101+
# if no scaled std is passed, use the std from the
1102+
# original fitting procedure; if none was saved, raise an error
1103+
if scale_std is None and self._scale_std is not None:
1104+
scale_std = self._scale_std
1105+
elif scale_std is None and self._scale_std is None:
1106+
raise ValueError(error_msg.format('scale_std',
1107+
'standard deviation',
1108+
'scale_std'))
10591109

10601110
# scale the data
1061-
X = (df - df.mean(0)) / df.std(0)
1111+
X = (df - scale_mean) / scale_std
10621112

1113+
# use the structure matrix, if it exists;
1114+
# otherwise, just use the loadings matrix
10631115
if self.structure is not None:
10641116
structure = self.structure
10651117
else:

factor_analyzer/rotator.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,10 @@ def oblique(self,
235235
rotation_mtx : np.array
236236
The rotation matrix
237237
(n_factors, n_factors)
238+
psi : np.array or None
239+
The factor correlations
240+
matrix. This only exists
241+
if the rotation is oblique.
238242
"""
239243
df = loadings.copy()
240244

@@ -526,6 +530,10 @@ def promax(self, loadings, normalize=False, power=4):
526530
rotation_mtx : np.array
527531
The rotation matrix
528532
(n_factors, n_factors)
533+
psi : np.array or None
534+
The factor correlations
535+
matrix. This only exists
536+
if the rotation is oblique.
529537
"""
530538
df = loadings.copy()
531539

@@ -620,6 +628,10 @@ def rotate(self, loadings, method='varimax', **kwargs):
620628
rotation_mtx : np.array
621629
The rotation matrix
622630
(n_factors, n_factors)
631+
psi : np.array or None
632+
The factor correlations
633+
matrix. This only exists
634+
if the rotation is oblique.
623635
624636
Raises
625637
------

factor_analyzer/test_utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,12 @@ def calculate_py_output(test_name,
7272

7373
if use_corr_matrix:
7474
X = data.corr()
75+
scale_mean = data.mean(0)
76+
scale_std = data.std(0)
7577
else:
7678
X = data.copy()
79+
scale_mean = None
80+
scale_std = None
7781

7882
rotation = None if rotation == 'none' else rotation
7983
method = {'uls': 'minres'}.get(method, method)
@@ -89,7 +93,7 @@ def calculate_py_output(test_name,
8993
'loading': fa.loadings,
9094
'uniquenesses': fa.get_uniqueness(),
9195
'communalities': fa.get_communalities(),
92-
'scores': fa.get_scores(data)}
96+
'scores': fa.get_scores(data, scale_mean, scale_std)}
9397

9498

9599
def collect_r_output(test_name,

tests/test_factor_analyzer.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -201,12 +201,13 @@ def test_analyze_rotation_value_error(self):
201201
@raises(ValueError)
202202
def test_analyze_infinite(self):
203203

204-
data = pd.DataFrame({'A': [2, 4, 5, 6, 8, 9],
205-
'B': [4, 8, float('inf'), 10, 16, 18],
206-
'C': [6, 12, 15, 12, 26, 27]})
204+
data = pd.DataFrame({'A': [1.0, 0.4, 0.5],
205+
'B': [0.4, 1.0, float('inf')],
206+
'C': [0.5, float('inf'), 1.0]},
207+
index=['A', 'B', 'C'])
207208

208209
fa = FactorAnalyzer()
209-
fa.analyze(data, 1, impute='drop')
210+
fa.analyze(data, 1, impute='drop', use_corr_matrix=True)
210211

211212
def test_remove_all_columns(self):
212213
# test that columns with string values are removed.

0 commit comments

Comments
 (0)