Skip to content

Commit f02c13e

Browse files
authored
Merge pull request #29 from EducationalTestingService/bugfix/fix-scores
Fix `get_scores()` method, and other issues
2 parents 7c3e3cd + 13a3232 commit f02c13e

File tree

4 files changed

+94
-25
lines changed

4 files changed

+94
-25
lines changed

factor_analyzer/factor_analyzer.py

Lines changed: 72 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,14 @@ class FactorAnalyzer:
148148
rotation_matrix : np.array
149149
The rotation matrix, if a rotation
150150
has been performed.
151-
151+
structure : np.array or None
152+
The structure loading matrix.
153+
This only exists if the rotation
154+
is promax.
155+
psi : np.array or None
156+
The factor correlations
157+
matrix. This only exists
158+
if the rotation is oblique.
152159
Notes
153160
-----
154161
This code was partly derived from the excellent R package
@@ -192,6 +199,9 @@ def __init__(self,
192199
self.loadings = None
193200
self.rotation_matrix = None
194201

202+
self._scale_mean = None
203+
self._scale_std = None
204+
195205
@staticmethod
196206
def _fit_uls_objective(psi, corr_mtx, n_factors):
197207
"""
@@ -510,7 +520,7 @@ def fit_factor_analysis(self,
510520
'Check to make sure you do not have any '
511521
'features with zero standard deviation.')
512522

513-
corr = corr.values
523+
corr = corr.copy().values
514524

515525
# if `use_smc` is True, get squared multiple correlations
516526
# and use these as initial guesses for optimizer
@@ -569,7 +579,6 @@ def analyze(self,
569579
normalize=True,
570580
impute='median',
571581
remove_non_numeric=True,
572-
use_scaling=True,
573582
use_corr_matrix=False,
574583
**kwargs):
575584
"""
@@ -628,11 +637,6 @@ def analyze(self,
628637
Remove any non-numeric data. If `use_corr_matrix` is True,
629638
no non-numeric data will be removed.
630639
Defaults to True.
631-
use_scaling : bool, optional
632-
Whether to scale the data by subtracting out the mean
633-
and dividing by the standard deviation. If `use_corr_matrix`
634-
is True, scaling will not be performed.
635-
Defaults to True.
636640
use_corr_matrix : bool, optional
637641
Set to true if the `data` is the correlation
638642
matrix.
@@ -694,16 +698,11 @@ def analyze(self,
694698
self.corr = df.copy()
695699
else:
696700
self.corr = df.corr()
697-
698-
# scale the data, if it is not a correlation
699-
# matrix and `use_scaling` is True
700-
if use_scaling and not use_corr_matrix:
701-
X = (df - df.mean(0)) / df.std(0)
702-
else:
703-
X = df.copy()
701+
self._scale_mean = df.mean(0)
702+
self._scale_std = df.std(0)
704703

705704
# fit factor analysis model
706-
loadings = self.fit_factor_analysis(X,
705+
loadings = self.fit_factor_analysis(df.copy(),
707706
n_factors,
708707
use_smc,
709708
bounds,
@@ -805,7 +804,7 @@ def get_eigenvalues(self):
805804
"""
806805
if (self.corr is not None and self.loadings is not None):
807806

808-
corr = self.corr.values
807+
corr = self.corr.copy().values
809808

810809
e_values, _ = sp.linalg.eigh(corr)
811810
e_values = pd.DataFrame(e_values[::-1],
@@ -945,20 +944,46 @@ def get_factor_variance(self):
945944

946945
return variance_info
947946

948-
def get_scores(self, data):
947+
def get_scores(self,
948+
data,
949+
scale_mean=None,
950+
scale_std=None):
949951
"""
950952
Get the factor scores, given the data.
951953
952954
Parameters
953955
----------
954956
data : pd.DataFrame
955957
The data to calculate factor scores.
958+
scale_mean : float or None
959+
The mean of the original
960+
data set used to fit the
961+
factor model. If None, attempt
962+
to retrieve the mean from the
963+
original `analyze()` method,
964+
if it was saved.
965+
Defaults to None.
966+
scale_std : float or None
967+
The standard deviation of the original
968+
data set used to fit the
969+
factor model. If None, attempt
970+
to retrieve the standard deviation from the
971+
original `analyze()` method,
972+
if it was saved.
973+
Defaults to None.
956974
957975
Returns
958976
-------
959977
scores : pd.DataFrame
960978
The factor scores.
961979
980+
Raises
981+
------
982+
ValueError
983+
If either scale_std or scale_mean
984+
is None, and the original mean or standard
985+
deviation were not saved during fitting.
986+
962987
Examples
963988
--------
964989
>>> import pandas as pd
@@ -977,11 +1002,38 @@ def get_scores(self, data):
9771002
if self.loadings is not None:
9781003

9791004
df = data.copy()
980-
corr = data.corr()
1005+
corr = self.corr.copy()
1006+
1007+
error_msg = ('The `{}` argument is None, but no scaled {} '
1008+
'was saved when fitting your original factor '
1009+
'model. This most likely because you used a '
1010+
'correlation matrix, rather than the full data '
1011+
'set. Please either pass a value for `{}` '
1012+
'or re-fit your model using the full data set.')
1013+
1014+
# if no scaled mean is passed, use the mean from the
1015+
# original fitting procedure; otherwise, raise an error
1016+
if scale_mean is None and self._scale_mean is not None:
1017+
scale_mean = self._scale_mean
1018+
elif scale_mean is None and self._scale_mean is None:
1019+
raise ValueError(error_msg.format('scale_mean',
1020+
'mean',
1021+
'scale_mean'))
1022+
1023+
# if no scaled std is passed, use the std from the
1024+
# original fitting procedure; otherwise, raise an error
1025+
if scale_std is None and self._scale_std is not None:
1026+
scale_std = self._scale_std
1027+
elif scale_std is None and self._scale_std is None:
1028+
raise ValueError(error_msg.format('scale_std',
1029+
'standard deviation',
1030+
'scale_std'))
9811031

9821032
# scale the data
983-
X = (df - df.mean(0)) / df.std(0)
1033+
X = (df - scale_mean) / scale_std
9841034

1035+
# use the structure matrix, if it exists;
1036+
# otherwise, just use the loadings matrix
9851037
if self.structure is not None:
9861038
structure = self.structure
9871039
else:

factor_analyzer/rotator.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,10 @@ def oblique(self,
234234
rotation_mtx : np.array
235235
The rotation matrix
236236
(n_factors, n_factors)
237+
psi : np.array or None
238+
The factor correlations
239+
matrix. This only exists
240+
if the rotation is oblique.
237241
"""
238242
df = loadings.copy()
239243

@@ -525,6 +529,10 @@ def promax(self, loadings, normalize=False, power=4):
525529
rotation_mtx : np.array
526530
The rotation matrix
527531
(n_factors, n_factors)
532+
psi : np.array or None
533+
The factor correlations
534+
matrix. This only exists
535+
if the rotation is oblique.
528536
"""
529537
df = loadings.copy()
530538

@@ -619,6 +627,10 @@ def rotate(self, loadings, method='varimax', **kwargs):
619627
rotation_mtx : np.array
620628
The rotation matrix
621629
(n_factors, n_factors)
630+
psi : np.array or None
631+
The factor correlations
632+
matrix. This only exists
633+
if the rotation is oblique.
622634
623635
Raises
624636
------

factor_analyzer/test_utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,12 @@ def calculate_py_output(test_name,
7373

7474
if use_corr_matrix:
7575
X = data.corr()
76+
scale_mean = data.mean(0)
77+
scale_std = data.std(0)
7678
else:
7779
X = data.copy()
80+
scale_mean = None
81+
scale_std = None
7882

7983
rotation = None if rotation == 'none' else rotation
8084
method = {'uls': 'minres'}.get(method, method)
@@ -90,7 +94,7 @@ def calculate_py_output(test_name,
9094
'loading': fa.loadings,
9195
'uniquenesses': fa.get_uniqueness(),
9296
'communalities': fa.get_communalities(),
93-
'scores': fa.get_scores(data)}
97+
'scores': fa.get_scores(data, scale_mean, scale_std)}
9498

9599

96100
def collect_r_output(test_name,

tests/test_factor_analyzer.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,12 +121,13 @@ def test_analyze_rotation_value_error(self):
121121
@raises(ValueError)
122122
def test_analyze_infinite(self):
123123

124-
data = pd.DataFrame({'A': [2, 4, 5, 6, 8, 9],
125-
'B': [4, 8, float('inf'), 10, 16, 18],
126-
'C': [6, 12, 15, 12, 26, 27]})
124+
data = pd.DataFrame({'A': [1.0, 0.4, 0.5],
125+
'B': [0.4, 1.0, float('inf')],
126+
'C': [0.5, float('inf'), 1.0]},
127+
index=['A', 'B', 'C'])
127128

128129
fa = FactorAnalyzer()
129-
fa.analyze(data, 1, impute='drop')
130+
fa.analyze(data, 1, impute='drop', use_corr_matrix=True)
130131

131132
def test_remove_all_columns(self):
132133
# test that columns with string values are removed.

0 commit comments

Comments
 (0)