Skip to content

Commit fcf9209

Browse files
authored
Merge pull request #2222 from SCIInstitute/acreegan
Update PCA_Embedder Saving and Loading #2204
2 parents 8a8530e + 8bf581a commit fcf9209

File tree

4 files changed

+438
-78
lines changed

4 files changed

+438
-78
lines changed

Python/DataAugmentationUtilsPackage/DataAugmentationUtils/DataAugmentation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def point_based_aug(out_dir, orig_img_list, orig_point_list, num_samples, num_di
3636
else:
3737
PointEmbedder = Embedder.PCA_Embbeder(point_matrix, num_dim, percent_variability)
3838
num_dim = PointEmbedder.num_dim
39-
PointEmbedder.write_PCA(out_dir + "PCA_Particle_Info/", "particles") # write PCA info for DeepSSM testing
39+
PointEmbedder.write_PCA(out_dir + "PCA_Particle_Info/", suffix="particles") # write PCA info for DeepSSM testing
4040
embedded_matrix = PointEmbedder.getEmbeddedMatrix()
4141
# Get sampler
4242
if sampler_type == "gaussian":

Python/DataAugmentationUtilsPackage/DataAugmentationUtils/Embedder.py

Lines changed: 222 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -6,84 +6,229 @@
66
import numpy as np
77
from abc import ABC, abstractmethod
88
from shapeworks.utils import sw_message
9+
from pathlib import Path
10+
from glob import glob
911

10-
# abstract base class for embedders
12+
# abstract base class for embedders
1113
class Embedder(ABC):
12-
# abstract method
13-
def __init__(self, data_matrix):
14-
self.data_matrix = data_matrix
15-
def getEmbeddedMatrix(self):
16-
pass
17-
def project(self, PCA_instance):
18-
pass
19-
14+
# abstract method
15+
def __init__(self, data_matrix):
16+
self.data_matrix = data_matrix
17+
def getEmbeddedMatrix(self):
18+
pass
19+
def project(self, PCA_instance):
20+
pass
21+
2022
# instance of embedder that uses PCA for dimension reduction
2123
class PCA_Embbeder(Embedder):
22-
# overriding abstract methods
23-
def __init__(self, data_matrix, num_dim=0, percent_variability=0.95):
24-
self.data_matrix = data_matrix
25-
num_dim = self.run_PCA(num_dim, percent_variability)
26-
self.num_dim = num_dim
27-
# run PCA on data_matrix for PCA_Embedder
28-
def run_PCA(self, num_dim, percent_variability):
29-
# get covariance matrix (uses compact trick)
30-
N = self.data_matrix.shape[0]
31-
data_matrix_2d = self.data_matrix.reshape(self.data_matrix.shape[0], -1).T # flatten data instances and transpose
32-
mean = np.mean(data_matrix_2d, axis=1)
33-
centered_data_matrix_2d = (data_matrix_2d.T - mean).T
34-
trick_cov_matrix = np.dot(centered_data_matrix_2d.T,centered_data_matrix_2d) * 1.0/np.sqrt(N-1)
35-
# get eignevectors and eigenvalues
36-
37-
# Check if percent_variability is within valid range
38-
if percent_variability < 0 or percent_variability > 100:
39-
percent_variability = 100
40-
41-
eigen_values, eigen_vectors = np.linalg.eigh(trick_cov_matrix)
42-
eigen_vectors = np.dot(centered_data_matrix_2d, eigen_vectors)
43-
for i in range(N):
44-
eigen_vectors[:,i] = eigen_vectors[:,i]/np.linalg.norm(eigen_vectors[:,i])
45-
eigen_values = np.flip(eigen_values)
46-
eigen_vectors = np.flip(eigen_vectors, 1)
47-
# get num PCA components
48-
cumDst = np.cumsum(eigen_values) / np.sum(eigen_values)
49-
if num_dim == 0:
50-
cumDst = np.cumsum(eigen_values) / np.sum(eigen_values)
51-
num_dim = np.where(cumDst > float(percent_variability))
52-
if num_dim and len(num_dim[0]) > 0:
53-
num_dim = num_dim[0][0] + 1
54-
else:
55-
num_dim = len(cumDst)
56-
W = eigen_vectors[:, :num_dim]
57-
PCA_scores = np.matmul(centered_data_matrix_2d.T, W)
58-
sw_message(f"The PCA modes of particles being retained : {num_dim}")
59-
sw_message(f"Variablity preserved: {str(float(cumDst[num_dim-1]))}")
60-
self.num_dim = num_dim
61-
self.PCA_scores = PCA_scores
62-
self.eigen_vectors = eigen_vectors
63-
self.eigen_values = eigen_values
64-
return num_dim
65-
66-
# write PCA info to files
67-
# @TODO do we need all of this?
68-
def write_PCA(self, out_dir, suffix):
69-
if not os.path.exists(out_dir):
70-
os.makedirs(out_dir)
71-
np.save(out_dir + 'original_PCA_scores.npy', self.PCA_scores)
72-
mean = np.mean(self.data_matrix, axis=0)
73-
np.savetxt(out_dir + 'mean.' + suffix, mean)
74-
np.savetxt(out_dir + 'eigenvalues.txt', self.eigen_values)
75-
for i in range(self.data_matrix.shape[0]):
76-
nm = out_dir + 'pcamode' + str(i) + '.' + suffix
77-
data = self.eigen_vectors[:, i]
78-
data = data.reshape(self.data_matrix.shape[1:])
79-
np.savetxt(nm, data)
80-
# returns embedded form of dtat_matrix
81-
def getEmbeddedMatrix(self):
82-
return self.PCA_scores
83-
# projects embbed array into data
84-
def project(self, PCA_instance):
85-
W = self.eigen_vectors[:, :self.num_dim].T
86-
mean = np.mean(self.data_matrix, axis=0)
87-
data_instance = np.matmul(PCA_instance, W) + mean.reshape(-1)
88-
data_instance = data_instance.reshape((self.data_matrix.shape[1:]))
89-
return data_instance
24+
def __init__(self, data_matrix=None, num_dim=0, percent_variability=0.95):
25+
"""
26+
Initialize the PCA_Embedder. If data_matrix is provided, a PCA model is generated.
27+
Otherwise, the attributes defining the model are initialized as None. A model can then be initialized from arrays
28+
using load_pca.
29+
30+
Parameters
31+
----------
32+
data_matrix
33+
Data to use to generate a PCA model
34+
num_dim
35+
Number of PCA dimensions to keep in the generated PCA scores. (Max is data_matrix.shape[0]-1, i.e., the maximum number of
36+
modes of variation is one less than the number of samples used to build the model.
37+
If set to zero, the maximum number of dimensions are kept.
38+
percent_variability
39+
Percentage of the variation in the input data to keep in the generated PCA scores, scaled to between 0 and 1.
40+
This is only used if num_dim is not set.
41+
"""
42+
super().__init__(data_matrix)
43+
self.PCA_scores = None
44+
self.eigen_vectors = None
45+
self.eigen_values = None
46+
47+
if data_matrix is not None:
48+
self.data_matrix = data_matrix
49+
self.mean_data = np.mean(self.data_matrix, axis=0)
50+
self.run_PCA(num_dim, percent_variability)
51+
52+
def run_PCA(self, num_dim, percent_variability):
53+
"""
54+
Perform principal component analysis on the data_matrix.
55+
56+
Parameters
57+
----------
58+
num_dim
59+
Number of PCA dimensions to keep in the model. (Max is data_matrix.shape[0]-1, i.e., the maximum number of
60+
modes of variation is one less than the number of samples used to build the model.
61+
If set to zero, the maximum number of dimensions are kept.
62+
percent_variability
63+
Percentage of the variation in the input data to keep in the model, scaled to between 0 and 1.
64+
This is only used if num_dim is not set.
65+
66+
Returns
67+
-------
68+
num_dim
69+
num_dim actually used
70+
"""
71+
# get covariance matrix (uses compact trick)
72+
N = self.data_matrix.shape[0]
73+
data_matrix_2d = self.data_matrix.reshape(self.data_matrix.shape[0],
74+
-1).T # flatten data instances and transpose
75+
mean = np.mean(data_matrix_2d, axis=1)
76+
centered_data_matrix_2d = (data_matrix_2d.T - mean).T
77+
trick_cov_matrix = np.dot(centered_data_matrix_2d.T, centered_data_matrix_2d) * 1.0 / np.sqrt(N - 1)
78+
# get eignevectors and eigenvalues
79+
80+
# Check if percent_variability is within valid range
81+
if percent_variability < 0 or percent_variability > 100:
82+
percent_variability = 100
83+
84+
eigen_values, eigen_vectors = np.linalg.eigh(trick_cov_matrix)
85+
eigen_vectors = np.dot(centered_data_matrix_2d, eigen_vectors)
86+
for i in range(N):
87+
eigen_vectors[:, i] = eigen_vectors[:, i] / np.linalg.norm(eigen_vectors[:, i])
88+
eigen_values = np.flip(eigen_values)
89+
eigen_vectors = np.flip(eigen_vectors, 1)
90+
# get num PCA components
91+
# Note that the number of the eigen_values and eigen_vectors is equal to the dimension of the data
92+
# matrix, but the last column is not used in the model because it describes no variation.
93+
cumDst = np.cumsum(eigen_values) / np.sum(eigen_values)
94+
if num_dim == 0:
95+
cumDst = np.cumsum(eigen_values) / np.sum(eigen_values)
96+
num_dim = np.where(cumDst >= float(percent_variability))
97+
if num_dim and len(num_dim[0]) > 0:
98+
num_dim = num_dim[0][0] + 1
99+
else:
100+
num_dim = len(cumDst)
101+
W = eigen_vectors[:, :num_dim]
102+
PCA_scores = np.matmul(centered_data_matrix_2d.T, W)
103+
sw_message(f"The PCA modes of particles being retained : {num_dim}")
104+
sw_message(f"Variablity preserved: {str(float(cumDst[num_dim - 1]))}")
105+
106+
self.PCA_scores = PCA_scores
107+
self.eigen_vectors = eigen_vectors
108+
self.eigen_values = eigen_values
109+
self.num_dim = num_dim
110+
return num_dim
111+
112+
def write_PCA(self, out_dir: Path, score_option="full", suffix="txt"):
113+
"""
114+
Write PCA data to a specified directory.
115+
116+
Parameters
117+
----------
118+
out_dir
119+
Directory in which to save PCA data
120+
score_option
121+
Option for how to save PCA scores. The full scores can be used to recreate the data used to create the
122+
model, which may be privileged information, so options are provided to save no information about the scores.
123+
Options are:
124+
full: Save complete scores
125+
Otherwise: Don't save scores
126+
suffix
127+
File extension to use
128+
"""
129+
out_dir = Path(out_dir)
130+
if not os.path.exists(out_dir):
131+
os.makedirs(out_dir)
132+
if score_option == "full":
133+
np.savetxt(str(out_dir / f'original_PCA_scores.{suffix}'), self.PCA_scores)
134+
mean = np.mean(self.data_matrix, axis=0)
135+
np.savetxt(str(out_dir / f'mean.{suffix}'), mean)
136+
np.savetxt(str(out_dir / f'eigenvalues.txt'), self.eigen_values)
137+
for i in range(self.data_matrix.shape[0]):
138+
nm = str(out_dir / f'pcamode{i}.{suffix}')
139+
data = self.eigen_vectors[:, i]
140+
data = data.reshape(self.data_matrix.shape[1:])
141+
np.savetxt(nm, data)
142+
143+
def project(self, PCA_instance):
144+
"""
145+
Maps a given set of scores to the data values (e.g., coordinate points) they represent, given the embedded
146+
PCA model.
147+
148+
Parameters
149+
----------
150+
PCA_instance
151+
A row vector containing one score for each PCA mode.
152+
153+
Returns
154+
-------
155+
data instance
156+
Data represented by the input scores for this PCA model
157+
158+
"""
159+
num_dim = len(PCA_instance)
160+
W = self.eigen_vectors[:, :num_dim].T
161+
data_instance = np.matmul(PCA_instance, W) + self.mean_data.reshape(-1)
162+
data_instance = data_instance.reshape(self.mean_data.shape)
163+
return data_instance
164+
165+
def load_PCA(self, mean_data, eigen_values, eigen_vectors, scores=None):
166+
"""
167+
Load PCA model from arrays
168+
169+
Parameters
170+
----------
171+
mean_data
172+
eigen_values
173+
eigen_vectors
174+
scores
175+
"""
176+
self.mean_data = mean_data
177+
self.eigen_values = eigen_values
178+
self.eigen_vectors = eigen_vectors
179+
self.PCA_scores = scores
180+
181+
@classmethod
182+
def from_directory(cls, directory: Path):
183+
"""
184+
Factory function to create a PCA_embedder instance by loading saved data from a specified directory.
185+
186+
Parameters
187+
----------
188+
directory
189+
Directory from which to load data
190+
191+
Returns
192+
-------
193+
embedder
194+
PCA_embedder instance
195+
196+
"""
197+
directory = Path(directory)
198+
199+
mean = np.loadtxt(glob(str(directory / "mean*"))[0])
200+
eigen_values = np.loadtxt(glob(str(directory / "eigenvalues*"))[0])
201+
eigen_vectors = []
202+
203+
eigen_vector_files = glob(str(directory / "pcamode*"))
204+
eigen_vector_files.sort(key=lambda f: int(str(Path(f).stem).split("pcamode")[-1])) # Sort numerically by name
205+
for file in eigen_vector_files:
206+
eigen_vector = np.flip(np.loadtxt(file))
207+
eigen_vectors.append(eigen_vector.reshape((-1)))
208+
209+
eigen_vectors = np.rot90(np.array(eigen_vectors))
210+
211+
embedder = cls()
212+
213+
scores = None
214+
if scores_glob := glob(str(directory / "original_PCA_scores*")):
215+
scores = np.loadtxt(scores_glob[0])
216+
217+
embedder.load_PCA(mean, eigen_values, eigen_vectors, scores=scores)
218+
219+
return embedder
220+
221+
def getEmbeddedMatrix(self):
222+
"""
223+
Get the embedded form of data_matrix
224+
225+
Returns
226+
-------
227+
PCA_scores
228+
A matrix with one row for each input data sample
229+
The columns are floats that represent the value for each PCA mode that together represent the input data
230+
sample.
231+
The number of columns indicates the number of PCA modes that were used to generate the scores.
232+
233+
"""
234+
return self.PCA_scores

Testing/PythonTests/PythonTests.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,10 @@ TEST(pythonTests, pcaTest) {
331331
run_test("pca.py");
332332
}
333333

334+
TEST(pythonTests, pcaEmbedderTest) {
335+
run_test("pcaembedder.py");
336+
}
337+
334338
TEST(pythonTests, findreferencemeshTest) {
335339
run_test("findReferenceMesh.py");
336340
}

0 commit comments

Comments
 (0)