-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpca.py
More file actions
137 lines (115 loc) · 4.44 KB
/
pca.py
File metadata and controls
137 lines (115 loc) · 4.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from functools import reduce
from numpy.core.fromnumeric import diagonal
from pandas import DataFrame
def get_centered_matrix(x: np.ndarray):
    """Subtract each column's mean from every row of *x*.

    Returns an array of the same shape whose columns each sum to zero.
    """
    n_rows, _ = x.shape
    col_means = np.sum(x, axis=0) / n_rows
    # broadcasting subtracts the mean vector from every row at once
    return x - col_means
def get_covariance_matrix(x: np.ndarray):
    """Unbiased sample covariance of the columns of *x* (n-1 denominator)."""
    num_obs = x.shape[0]
    centered = get_centered_matrix(x)
    # X_c' X_c / (n - 1) is the standard sample covariance estimator
    return np.matmul(centered.transpose(), centered) / (num_obs - 1)
def get_correlation_matrix(x: np.ndarray):
    """Correlation matrix: covariance scaled by the outer product of std devs."""
    cov = get_covariance_matrix(x)
    std_devs = np.sqrt(np.diag(cov))
    return cov / np.outer(std_devs, std_devs)
def get_proportion_of_variance(x: np.ndarray):
    """Eigen-decompose the correlation matrix of *x* and report, per
    principal component, the cumulative proportion of variance explained.

    Returns a dict keyed by component index (0 = largest eigenvalue), each
    value holding "cummulated-proportion", "eigenvalue" and "eigenvector"
    (keys kept as-is for backward compatibility).
    """
    result = {}
    mat = get_correlation_matrix(x)
    trace = np.trace(mat)
    # The correlation matrix is symmetric, so use the symmetric solver
    # (guaranteed real eigenvalues, orthonormal eigenvectors), then sort
    # descending — plain eig() returns them in arbitrary order, which made
    # the cumulative proportions below meaningless.
    eigenval, eigenvec = np.linalg.eigh(mat)
    order = np.argsort(eigenval)[::-1]
    eigenval = eigenval[order]
    eigenvec = eigenvec[:, order]
    # trace == sum of eigenvalues, so this is the cumulative variance share
    cum_proportion = np.cumsum(eigenval / trace)
    for i, v in enumerate(cum_proportion):
        result[i] = {
            "cummulated-proportion": v,
            "eigenvalue": eigenval[i],
            # eigenvectors are the COLUMNS of eigenvec; the original row
            # indexing (eigenvec[i]) returned mixed components
            "eigenvector": eigenvec[:, i],
        }
    return result
def get_principle_component_scores(x: np.ndarray, eigenvalueCut=1):
    """Project centered *x* onto every eigenvector whose eigenvalue exceeds
    eigenvalueCut; scores come back as columns, one per retained component."""
    centered = get_centered_matrix(x)
    kept = [
        np.matmul(centered, info["eigenvector"])
        for info in get_proportion_of_variance(x).values()
        if info["eigenvalue"] > eigenvalueCut
    ]
    return np.array(kept).transpose()
def compare_correlations_principle_component_scores_and_variables(df: pd.DataFrame, eigenvalueCut=1):
    """Correlations between the original variables (rows) and the retained
    principal-component scores (columns)."""
    data = np.array(df)
    num_vars = data.shape[1]
    scores = pd.DataFrame(get_principle_component_scores(data, eigenvalueCut))
    scores.columns = [f'pc{j}' for j in range(scores.shape[1])]
    joint_corr = pd.concat([df, scores], axis=1).corr()
    # top-right quadrant of the joint matrix: variables vs. PC scores
    return joint_corr.iloc[:num_vars, num_vars:]
def draw_scree_plot(x: np.ndarray):
    """Scree plot of the correlation-matrix eigenvalues, largest first.

    Shows the figure via plt.show(); returns nothing.
    """
    mat = get_correlation_matrix(x)
    # eigh: symmetric solver gives real eigenvalues; sort descending so the
    # plot actually reads as a scree curve (eig() returned arbitrary order).
    eigenval = np.sort(np.linalg.eigh(mat)[0])[::-1]
    # (redundant local "import matplotlib.pyplot" removed — plt is already
    # imported at module top)
    plt.title("SCREE PLOT OF EIGENVALUES")
    plt.xlabel("index")
    plt.plot(eigenval, 'o-')
    plt.show()
def get_zero_containing_variables(df: pd.DataFrame):
    """Return the names of columns that contain at least one zero."""
    has_zero = (df == 0).any()
    return df.columns[has_zero].to_list()
def add_one_to_zero_containing_variables(df: pd.DataFrame):
    """In place: add 1 to every column that contains a zero (log-prep)."""
    # inline zero-column detection rather than calling the helper
    zero_cols = df.columns[(df == 0).any()].to_list()
    df[zero_cols] += 1
def log_transformation(df: pd.DataFrame):
    """Natural-log transform of *df*, with column names prefixed 'log'.

    Columns containing zeros get +1 first — NOTE: this mutates the
    caller's DataFrame in place.
    """
    add_one_to_zero_containing_variables(df)
    logged = np.log(df)
    logged.columns = ['log' + name for name in df.columns.to_list()]
    return logged
if __name__ == "__main__":
    import pandas as pd
    # Load the Navy housing dataset: whitespace-delimited, no header row.
    # NOTE(review): requires 'navy.dat' in the working directory — verify.
    navy_df = pd.read_csv(
        'navy.dat',
        header=None,
        index_col=False,
        delim_whitespace=True)
    navy_df.columns = [
        'ID',
        'ADO',  # avg daily occupancy
        'MAC',  # avg number of check-ins
        'WHR',  # weekly hrs of service desk operation
        'CUA',  # sq ft of common use area
        'WINGS',  # number of building wings
        'OBC',  # operational berthing capacity
        'RMS',  # number of rooms
        'MMH'  # monthly man-hours required to operate
    ]
    # drop the ID column, keeping the numeric features only
    navy_df = navy_df.iloc[:, 1:]
    navy_df.head(1)  # 25 obs, 8 features(p) — result discarded (no-op here)
    # log-transform; NOTE: also mutates navy_df in place (+1 to zero columns)
    logNavy = log_transformation(navy_df)
    logNavy_array = np.array(logNavy)
    from pprint import pprint as prnt
    # per-component eigenvalues and cumulative variance proportions
    prnt(get_proportion_of_variance(logNavy_array))
    # selects only one principle component
    pcs = get_principle_component_scores(logNavy_array, eigenvalueCut=1)
    print(compare_correlations_principle_component_scores_and_variables(
        logNavy, eigenvalueCut=1)
    )
    import scipy.stats as stats
    # normality checks on the retained PC scores
    pcs_flattened = pcs.flatten()
    from scipy.stats import kstest, shapiro, anderson, cramervonmises
    x = pcs_flattened
    m = x.mean()
    s = x.std()
    print("Shapiro-Wilk: ", shapiro(x))
    # KS / Cramer-von Mises test against a normal fitted with sample mean/std
    print("KS test: ", kstest(x, 'norm', args=(m, s)))
    print("Anderson: ", anderson(x, 'norm'))
    print("Cramer Von Mises: ", cramervonmises(x, "norm", args=(m, s)))
    # Q-Q plot of the scores against a normal distribution
    (stats.probplot(pcs_flattened, plot=plt))