Skip to content

Commit 026d903

Browse files
authored
Merge pull request #3 from acreegan/update_PCA_Embedder_clean
Update pca embedder clean
2 parents 077c588 + 2b1a48e commit 026d903

File tree

3 files changed

+363
-58
lines changed

3 files changed

+363
-58
lines changed

Python/DataAugmentationUtilsPackage/DataAugmentationUtils/Embedder.py

Lines changed: 203 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import numpy as np
77
from abc import ABC, abstractmethod
88
from shapeworks.utils import sw_message
9+
from pathlib import Path
10+
from glob import glob
911

1012
# abstract base class for embedders
1113
class Embedder(ABC):
@@ -19,61 +21,204 @@ def project(self, PCA_instance):
1921

2022
# instance of embedder that uses PCA for dimension reduction
2123
class PCA_Embbeder(Embedder):
22-
# overriding abstract methods
23-
def __init__(self, data_matrix, num_dim=0, percent_variability=0.95):
24-
self.data_matrix = data_matrix
25-
num_dim = self.run_PCA(num_dim, percent_variability)
26-
self.num_dim = num_dim
27-
# run PCA on data_matrix for PCA_Embedder
28-
def run_PCA(self, num_dim, percent_variability):
29-
# get covariance matrix (uses compact trick)
30-
N = self.data_matrix.shape[0]
31-
data_matrix_2d = self.data_matrix.reshape(self.data_matrix.shape[0], -1).T # flatten data instances and transpose
32-
mean = np.mean(data_matrix_2d, axis=1)
33-
centered_data_matrix_2d = (data_matrix_2d.T - mean).T
34-
trick_cov_matrix = np.dot(centered_data_matrix_2d.T,centered_data_matrix_2d) * 1.0/np.sqrt(N-1)
35-
# get eignevectors and eigenvalues
36-
eigen_values, eigen_vectors = np.linalg.eigh(trick_cov_matrix)
37-
eigen_vectors = np.dot(centered_data_matrix_2d, eigen_vectors)
38-
for i in range(N):
39-
eigen_vectors[:,i] = eigen_vectors[:,i]/np.linalg.norm(eigen_vectors[:,i])
40-
eigen_values = np.flip(eigen_values)
41-
eigen_vectors = np.flip(eigen_vectors, 1)
42-
# get num PCA components
43-
cumDst = np.cumsum(eigen_values) / np.sum(eigen_values)
44-
if num_dim == 0:
45-
cumDst = np.cumsum(eigen_values) / np.sum(eigen_values)
46-
num_dim = np.where(cumDst > float(percent_variability))[0][0] + 1
47-
W = eigen_vectors[:, :num_dim]
48-
PCA_scores = np.matmul(centered_data_matrix_2d.T, W)
49-
sw_message(f"The PCA modes of particles being retained : {num_dim}")
50-
sw_message(f"Variablity preserved: {str(float(cumDst[num_dim-1]))}")
51-
self.num_dim = num_dim
52-
self.PCA_scores = PCA_scores
53-
self.eigen_vectors = eigen_vectors
54-
self.eigen_values = eigen_values
55-
return num_dim
56-
# write PCA info to files
57-
# @TODO do we need all of this?
58-
def write_PCA(self, out_dir, suffix):
59-
if not os.path.exists(out_dir):
60-
os.makedirs(out_dir)
61-
np.save(out_dir + 'original_PCA_scores.npy', self.PCA_scores)
62-
mean = np.mean(self.data_matrix, axis=0)
63-
np.savetxt(out_dir + 'mean.' + suffix, mean)
64-
np.savetxt(out_dir + 'eigenvalues.txt', self.eigen_values)
65-
for i in range(self.data_matrix.shape[0]):
66-
nm = out_dir + 'pcamode' + str(i) + '.' + suffix
67-
data = self.eigen_vectors[:, i]
68-
data = data.reshape(self.data_matrix.shape[1:])
69-
np.savetxt(nm, data)
70-
# returns embedded form of dtat_matrix
71-
def getEmbeddedMatrix(self):
72-
return self.PCA_scores
73-
# projects embbed array into data
74-
def project(self, PCA_instance):
75-
W = self.eigen_vectors[:, :self.num_dim].T
76-
mean = np.mean(self.data_matrix, axis=0)
77-
data_instance = np.matmul(PCA_instance, W) + mean.reshape(-1)
78-
data_instance = data_instance.reshape((self.data_matrix.shape[1:]))
79-
return data_instance
24+
def __init__(self, data_matrix=None, num_dim=0, percent_variability=0.95):
25+
"""
26+
Initialize the PCA_Embedder. If data_matrix is provided, a PCA model is generated.
27+
Otherwise, the attributes defining the model are initialized as None. A model can then be initialized from arrays
28+
using load_pca.
29+
30+
Parameters
31+
----------
32+
data_matrix
33+
Data to use to generate a PCA model
34+
num_dim
35+
Number of PCA dimensions to keep in the generated PCA scores. (Max is data_matrix.shape[0]-1, i.e., the maximum number of
36+
modes of variation is one less than the number of samples used to build the model.
37+
If set to zero, the maximum number of dimensions are kept.
38+
percent_variability
39+
Percentage of the variation in the input data to keep in the generated PCA scores, scaled to between 0 and 1.
40+
This is only used if num_dim is not set.
41+
"""
42+
super().__init__(data_matrix)
43+
self.PCA_scores = None
44+
self.eigen_vectors = None
45+
self.eigen_values = None
46+
47+
if data_matrix is not None:
48+
self.data_matrix = data_matrix
49+
self.mean_data = np.mean(self.data_matrix, axis=0)
50+
self.run_PCA(num_dim, percent_variability)
51+
52+
def run_PCA(self, num_dim, percent_variability):
53+
"""
54+
Perform principal component analysis on the data_matrix.
55+
56+
Parameters
57+
----------
58+
num_dim
59+
Number of PCA dimensions to keep in the model. (Max is data_matrix.shape[0]-1, i.e., the maximum number of
60+
modes of variation is one less than the number of samples used to build the model.
61+
If set to zero, the maximum number of dimensions are kept.
62+
percent_variability
63+
Percentage of the variation in the input data to keep in the model, scaled to between 0 and 1.
64+
This is only used if num_dim is not set.
65+
66+
Returns
67+
-------
68+
num_dim
69+
num_dim actually used
70+
"""
71+
# get covariance matrix (uses compact trick)
72+
N = self.data_matrix.shape[0]
73+
data_matrix_2d = self.data_matrix.reshape(self.data_matrix.shape[0],
74+
-1).T # flatten data instances and transpose
75+
mean = np.mean(data_matrix_2d, axis=1)
76+
centered_data_matrix_2d = (data_matrix_2d.T - mean).T
77+
trick_cov_matrix = np.dot(centered_data_matrix_2d.T, centered_data_matrix_2d) * 1.0 / np.sqrt(N - 1)
78+
# get eignevectors and eigenvalues
79+
eigen_values, eigen_vectors = np.linalg.eigh(trick_cov_matrix)
80+
eigen_vectors = np.dot(centered_data_matrix_2d, eigen_vectors)
81+
for i in range(N):
82+
eigen_vectors[:, i] = eigen_vectors[:, i] / np.linalg.norm(eigen_vectors[:, i])
83+
eigen_values = np.flip(eigen_values)
84+
eigen_vectors = np.flip(eigen_vectors, 1)
85+
# get num PCA components
86+
# Note that the number of the eigen_values and eigen_vectors is equal to the dimension of the data
87+
# matrix, but the last column is not used in the model because it describes no variation.
88+
cumDst = np.cumsum(eigen_values) / np.sum(eigen_values)
89+
if num_dim == 0:
90+
num_dim = np.where(cumDst >= float(percent_variability))[0][0] + 1
91+
W = eigen_vectors[:, :num_dim]
92+
PCA_scores = np.matmul(centered_data_matrix_2d.T, W)
93+
sw_message(f"The PCA modes of particles being retained : {num_dim}")
94+
sw_message(f"Variablity preserved: {str(float(cumDst[num_dim - 1]))}")
95+
96+
self.PCA_scores = PCA_scores
97+
self.eigen_vectors = eigen_vectors
98+
self.eigen_values = eigen_values
99+
return num_dim
100+
101+
def write_PCA(self, out_dir: Path, score_option="full", suffix="txt"):
102+
"""
103+
Write PCA data to a specified directory.
104+
105+
Parameters
106+
----------
107+
out_dir
108+
Directory in which to save PCA data
109+
score_option
110+
Option for how to save PCA scores. The full scores can be used to recreate the data used to create the
111+
model, which may be privileged information, so options are provided to save no information about the scores.
112+
Options are:
113+
full: Save complete scores
114+
Otherwise: Don't save scores
115+
suffix
116+
File extension to use
117+
"""
118+
out_dir = Path(out_dir)
119+
if not os.path.exists(out_dir):
120+
os.makedirs(out_dir)
121+
if score_option == "full":
122+
np.savetxt(str(out_dir / f'original_PCA_scores.{suffix}'), self.PCA_scores)
123+
124+
mean = np.mean(self.data_matrix, axis=0)
125+
np.savetxt(str(out_dir / f'mean.{suffix}'), mean)
126+
np.savetxt(str(out_dir / f'eigenvalues.{suffix}'), self.eigen_values)
127+
for i in range(self.data_matrix.shape[0]):
128+
nm = str(out_dir / f'pcamode{i}.{suffix}')
129+
data = self.eigen_vectors[:, i]
130+
data = data.reshape(self.data_matrix.shape[1:])
131+
np.savetxt(nm, data)
132+
133+
def project(self, PCA_instance):
134+
"""
135+
Maps a given set of scores to the data values (e.g., coordinate points) they represent, given the embedded
136+
PCA model.
137+
138+
Parameters
139+
----------
140+
PCA_instance
141+
A row vector containing one score for each PCA mode.
142+
143+
Returns
144+
-------
145+
data instance
146+
Data represented by the input scores for this PCA model
147+
148+
"""
149+
num_dim = len(PCA_instance)
150+
W = self.eigen_vectors[:, :num_dim].T
151+
data_instance = np.matmul(PCA_instance, W) + self.mean_data.reshape(-1)
152+
data_instance = data_instance.reshape(self.mean_data.shape)
153+
return data_instance
154+
155+
def load_PCA(self, mean_data, eigen_values, eigen_vectors, scores=None):
156+
"""
157+
Load PCA model from arrays
158+
159+
Parameters
160+
----------
161+
mean_data
162+
eigen_values
163+
eigen_vectors
164+
scores
165+
"""
166+
self.mean_data = mean_data
167+
self.eigen_values = eigen_values
168+
self.eigen_vectors = eigen_vectors
169+
self.PCA_scores = scores
170+
171+
@classmethod
172+
def from_directory(cls, directory: Path):
173+
"""
174+
Factory function to create a PCA_embedder instance by loading saved data from a specified directory.
175+
176+
Parameters
177+
----------
178+
directory
179+
Directory from which to load data
180+
181+
Returns
182+
-------
183+
embedder
184+
PCA_embedder instance
185+
186+
"""
187+
directory = Path(directory)
188+
189+
mean = np.loadtxt(glob(str(directory / "mean*"))[0])
190+
eigen_values = np.loadtxt(glob(str(directory / "eigenvalues*"))[0])
191+
eigen_vectors = []
192+
193+
eigen_vector_files = glob(str(directory / "pcamode*"))
194+
eigen_vector_files.sort(key=lambda f: int(str(Path(f).stem).split("pcamode")[-1])) # Sort numerically by name
195+
for file in eigen_vector_files:
196+
eigen_vector = np.flip(np.loadtxt(file))
197+
eigen_vectors.append(eigen_vector.reshape((-1)))
198+
199+
eigen_vectors = np.rot90(np.array(eigen_vectors))
200+
201+
embedder = cls()
202+
203+
scores = None
204+
if scores_glob := glob(str(directory / "original_PCA_scores*")):
205+
scores = np.loadtxt(scores_glob[0])
206+
207+
embedder.load_PCA(mean, eigen_values, eigen_vectors, scores=scores)
208+
209+
return embedder
210+
211+
def getEmbeddedMatrix(self):
212+
"""
213+
Get the embedded form of data_matrix
214+
215+
Returns
216+
-------
217+
PCA_scores
218+
A matrix with one row for each input data sample
219+
The columns are floats that represent the value for each PCA mode that together represent the input data
220+
sample.
221+
The number of columns indicates the number of PCA modes that were used to generate the scores.
222+
223+
"""
224+
return self.PCA_scores

Testing/PythonTests/PythonTests.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,10 @@ TEST(pythonTests, pcaTest) {
331331
run_test("pca.py");
332332
}
333333

334+
TEST(pythonTests, pcaEmbedderTest) {
335+
run_test("pcaembedder.py");
336+
}
337+
334338
TEST(pythonTests, findreferencemeshTest) {
335339
run_test("findReferenceMesh.py");
336340
}

0 commit comments

Comments
 (0)