66import numpy as np
77from abc import ABC , abstractmethod
88from shapeworks .utils import sw_message
9+ from pathlib import Path
10+ from glob import glob
911
1012# abstract base class for embedders
1113class Embedder (ABC ):
@@ -19,61 +21,204 @@ def project(self, PCA_instance):
1921
2022# instance of embedder that uses PCA for dimension reduction
2123class PCA_Embbeder (Embedder ):
22- # overriding abstract methods
23- def __init__ (self , data_matrix , num_dim = 0 , percent_variability = 0.95 ):
24- self .data_matrix = data_matrix
25- num_dim = self .run_PCA (num_dim , percent_variability )
26- self .num_dim = num_dim
27- # run PCA on data_matrix for PCA_Embedder
28- def run_PCA (self , num_dim , percent_variability ):
29- # get covariance matrix (uses compact trick)
30- N = self .data_matrix .shape [0 ]
31- data_matrix_2d = self .data_matrix .reshape (self .data_matrix .shape [0 ], - 1 ).T # flatten data instances and transpose
32- mean = np .mean (data_matrix_2d , axis = 1 )
33- centered_data_matrix_2d = (data_matrix_2d .T - mean ).T
34- trick_cov_matrix = np .dot (centered_data_matrix_2d .T ,centered_data_matrix_2d ) * 1.0 / np .sqrt (N - 1 )
35- # get eignevectors and eigenvalues
36- eigen_values , eigen_vectors = np .linalg .eigh (trick_cov_matrix )
37- eigen_vectors = np .dot (centered_data_matrix_2d , eigen_vectors )
38- for i in range (N ):
39- eigen_vectors [:,i ] = eigen_vectors [:,i ]/ np .linalg .norm (eigen_vectors [:,i ])
40- eigen_values = np .flip (eigen_values )
41- eigen_vectors = np .flip (eigen_vectors , 1 )
42- # get num PCA components
43- cumDst = np .cumsum (eigen_values ) / np .sum (eigen_values )
44- if num_dim == 0 :
45- cumDst = np .cumsum (eigen_values ) / np .sum (eigen_values )
46- num_dim = np .where (cumDst > float (percent_variability ))[0 ][0 ] + 1
47- W = eigen_vectors [:, :num_dim ]
48- PCA_scores = np .matmul (centered_data_matrix_2d .T , W )
49- sw_message (f"The PCA modes of particles being retained : { num_dim } " )
50- sw_message (f"Variablity preserved: { str (float (cumDst [num_dim - 1 ]))} " )
51- self .num_dim = num_dim
52- self .PCA_scores = PCA_scores
53- self .eigen_vectors = eigen_vectors
54- self .eigen_values = eigen_values
55- return num_dim
56- # write PCA info to files
57- # @TODO do we need all of this?
58- def write_PCA (self , out_dir , suffix ):
59- if not os .path .exists (out_dir ):
60- os .makedirs (out_dir )
61- np .save (out_dir + 'original_PCA_scores.npy' , self .PCA_scores )
62- mean = np .mean (self .data_matrix , axis = 0 )
63- np .savetxt (out_dir + 'mean.' + suffix , mean )
64- np .savetxt (out_dir + 'eigenvalues.txt' , self .eigen_values )
65- for i in range (self .data_matrix .shape [0 ]):
66- nm = out_dir + 'pcamode' + str (i ) + '.' + suffix
67- data = self .eigen_vectors [:, i ]
68- data = data .reshape (self .data_matrix .shape [1 :])
69- np .savetxt (nm , data )
70- # returns embedded form of dtat_matrix
71- def getEmbeddedMatrix (self ):
72- return self .PCA_scores
73- # projects embbed array into data
74- def project (self , PCA_instance ):
75- W = self .eigen_vectors [:, :self .num_dim ].T
76- mean = np .mean (self .data_matrix , axis = 0 )
77- data_instance = np .matmul (PCA_instance , W ) + mean .reshape (- 1 )
78- data_instance = data_instance .reshape ((self .data_matrix .shape [1 :]))
79- return data_instance
24+ def __init__ (self , data_matrix = None , num_dim = 0 , percent_variability = 0.95 ):
25+ """
26+ Initialize the PCA_Embedder. If data_matrix is provided, a PCA model is generated.
27+ Otherwise, the attributes defining the model are initialized as None. A model can then be initialized from arrays
28+ using load_pca.
29+
30+ Parameters
31+ ----------
32+ data_matrix
33+ Data to use to generate a PCA model
34+ num_dim
35+ Number of PCA dimensions to keep in the generated PCA scores. (Max is data_matrix.shape[0]-1, i.e., the maximum number of
36+ modes of variation is one less than the number of samples used to build the model.
37+ If set to zero, the maximum number of dimensions are kept.
38+ percent_variability
39+ Percentage of the variation in the input data to keep in the generated PCA scores, scaled to between 0 and 1.
40+ This is only used if num_dim is not set.
41+ """
42+ super ().__init__ (data_matrix )
43+ self .PCA_scores = None
44+ self .eigen_vectors = None
45+ self .eigen_values = None
46+
47+ if data_matrix is not None :
48+ self .data_matrix = data_matrix
49+ self .mean_data = np .mean (self .data_matrix , axis = 0 )
50+ self .run_PCA (num_dim , percent_variability )
51+
52+ def run_PCA (self , num_dim , percent_variability ):
53+ """
54+ Perform principal component analysis on the data_matrix.
55+
56+ Parameters
57+ ----------
58+ num_dim
59+ Number of PCA dimensions to keep in the model. (Max is data_matrix.shape[0]-1, i.e., the maximum number of
60+ modes of variation is one less than the number of samples used to build the model.
61+ If set to zero, the maximum number of dimensions are kept.
62+ percent_variability
63+ Percentage of the variation in the input data to keep in the model, scaled to between 0 and 1.
64+ This is only used if num_dim is not set.
65+
66+ Returns
67+ -------
68+ num_dim
69+ num_dim actually used
70+ """
71+ # get covariance matrix (uses compact trick)
72+ N = self .data_matrix .shape [0 ]
73+ data_matrix_2d = self .data_matrix .reshape (self .data_matrix .shape [0 ],
74+ - 1 ).T # flatten data instances and transpose
75+ mean = np .mean (data_matrix_2d , axis = 1 )
76+ centered_data_matrix_2d = (data_matrix_2d .T - mean ).T
77+ trick_cov_matrix = np .dot (centered_data_matrix_2d .T , centered_data_matrix_2d ) * 1.0 / np .sqrt (N - 1 )
78+ # get eignevectors and eigenvalues
79+ eigen_values , eigen_vectors = np .linalg .eigh (trick_cov_matrix )
80+ eigen_vectors = np .dot (centered_data_matrix_2d , eigen_vectors )
81+ for i in range (N ):
82+ eigen_vectors [:, i ] = eigen_vectors [:, i ] / np .linalg .norm (eigen_vectors [:, i ])
83+ eigen_values = np .flip (eigen_values )
84+ eigen_vectors = np .flip (eigen_vectors , 1 )
85+ # get num PCA components
86+ # Note that the number of the eigen_values and eigen_vectors is equal to the dimension of the data
87+ # matrix, but the last column is not used in the model because it describes no variation.
88+ cumDst = np .cumsum (eigen_values ) / np .sum (eigen_values )
89+ if num_dim == 0 :
90+ num_dim = np .where (cumDst >= float (percent_variability ))[0 ][0 ] + 1
91+ W = eigen_vectors [:, :num_dim ]
92+ PCA_scores = np .matmul (centered_data_matrix_2d .T , W )
93+ sw_message (f"The PCA modes of particles being retained : { num_dim } " )
94+ sw_message (f"Variablity preserved: { str (float (cumDst [num_dim - 1 ]))} " )
95+
96+ self .PCA_scores = PCA_scores
97+ self .eigen_vectors = eigen_vectors
98+ self .eigen_values = eigen_values
99+ return num_dim
100+
101+ def write_PCA (self , out_dir : Path , score_option = "full" , suffix = "txt" ):
102+ """
103+ Write PCA data to a specified directory.
104+
105+ Parameters
106+ ----------
107+ out_dir
108+ Directory in which to save PCA data
109+ score_option
110+ Option for how to save PCA scores. The full scores can be used to recreate the data used to create the
111+ model, which may be privileged information, so options are provided to save no information about the scores.
112+ Options are:
113+ full: Save complete scores
114+ Otherwise: Don't save scores
115+ suffix
116+ File extension to use
117+ """
118+ out_dir = Path (out_dir )
119+ if not os .path .exists (out_dir ):
120+ os .makedirs (out_dir )
121+ if score_option == "full" :
122+ np .savetxt (str (out_dir / f'original_PCA_scores.{ suffix } ' ), self .PCA_scores )
123+
124+ mean = np .mean (self .data_matrix , axis = 0 )
125+ np .savetxt (str (out_dir / f'mean.{ suffix } ' ), mean )
126+ np .savetxt (str (out_dir / f'eigenvalues.{ suffix } ' ), self .eigen_values )
127+ for i in range (self .data_matrix .shape [0 ]):
128+ nm = str (out_dir / f'pcamode{ i } .{ suffix } ' )
129+ data = self .eigen_vectors [:, i ]
130+ data = data .reshape (self .data_matrix .shape [1 :])
131+ np .savetxt (nm , data )
132+
133+ def project (self , PCA_instance ):
134+ """
135+ Maps a given set of scores to the data values (e.g., coordinate points) they represent, given the embedded
136+ PCA model.
137+
138+ Parameters
139+ ----------
140+ PCA_instance
141+ A row vector containing one score for each PCA mode.
142+
143+ Returns
144+ -------
145+ data instance
146+ Data represented by the input scores for this PCA model
147+
148+ """
149+ num_dim = len (PCA_instance )
150+ W = self .eigen_vectors [:, :num_dim ].T
151+ data_instance = np .matmul (PCA_instance , W ) + self .mean_data .reshape (- 1 )
152+ data_instance = data_instance .reshape (self .mean_data .shape )
153+ return data_instance
154+
155+ def load_PCA (self , mean_data , eigen_values , eigen_vectors , scores = None ):
156+ """
157+ Load PCA model from arrays
158+
159+ Parameters
160+ ----------
161+ mean_data
162+ eigen_values
163+ eigen_vectors
164+ scores
165+ """
166+ self .mean_data = mean_data
167+ self .eigen_values = eigen_values
168+ self .eigen_vectors = eigen_vectors
169+ self .PCA_scores = scores
170+
171+ @classmethod
172+ def from_directory (cls , directory : Path ):
173+ """
174+ Factory function to create a PCA_embedder instance by loading saved data from a specified directory.
175+
176+ Parameters
177+ ----------
178+ directory
179+ Directory from which to load data
180+
181+ Returns
182+ -------
183+ embedder
184+ PCA_embedder instance
185+
186+ """
187+ directory = Path (directory )
188+
189+ mean = np .loadtxt (glob (str (directory / "mean*" ))[0 ])
190+ eigen_values = np .loadtxt (glob (str (directory / "eigenvalues*" ))[0 ])
191+ eigen_vectors = []
192+
193+ eigen_vector_files = glob (str (directory / "pcamode*" ))
194+ eigen_vector_files .sort (key = lambda f : int (str (Path (f ).stem ).split ("pcamode" )[- 1 ])) # Sort numerically by name
195+ for file in eigen_vector_files :
196+ eigen_vector = np .flip (np .loadtxt (file ))
197+ eigen_vectors .append (eigen_vector .reshape ((- 1 )))
198+
199+ eigen_vectors = np .rot90 (np .array (eigen_vectors ))
200+
201+ embedder = cls ()
202+
203+ scores = None
204+ if scores_glob := glob (str (directory / "original_PCA_scores*" )):
205+ scores = np .loadtxt (scores_glob [0 ])
206+
207+ embedder .load_PCA (mean , eigen_values , eigen_vectors , scores = scores )
208+
209+ return embedder
210+
211+ def getEmbeddedMatrix (self ):
212+ """
213+ Get the embedded form of data_matrix
214+
215+ Returns
216+ -------
217+ PCA_scores
218+ A matrix with one row for each input data sample
219+ The columns are floats that represent the value for each PCA mode that together represent the input data
220+ sample.
221+ The number of columns indicates the number of PCA modes that were used to generate the scores.
222+
223+ """
224+ return self .PCA_scores
0 commit comments