66import numpy as np
77from abc import ABC , abstractmethod
88from shapeworks .utils import sw_message
9+ from pathlib import Path
10+ from glob import glob
911
10- # abstract base class for embedders
12+ # abstract base class for embedders
1113class Embedder (ABC ):
12- # abstract method
13- def __init__ (self , data_matrix ):
14- self .data_matrix = data_matrix
15- def getEmbeddedMatrix (self ):
16- pass
17- def project (self , PCA_instance ):
18- pass
19-
14+ # abstract method
15+ def __init__ (self , data_matrix ):
16+ self .data_matrix = data_matrix
17+ def getEmbeddedMatrix (self ):
18+ pass
19+ def project (self , PCA_instance ):
20+ pass
21+
2022# instance of embedder that uses PCA for dimension reduction
2123class PCA_Embbeder (Embedder ):
22- # overriding abstract methods
23- def __init__ (self , data_matrix , num_dim = 0 , percent_variability = 0.95 ):
24- self .data_matrix = data_matrix
25- num_dim = self .run_PCA (num_dim , percent_variability )
26- self .num_dim = num_dim
27- # run PCA on data_matrix for PCA_Embedder
28- def run_PCA (self , num_dim , percent_variability ):
29- # get covariance matrix (uses compact trick)
30- N = self .data_matrix .shape [0 ]
31- data_matrix_2d = self .data_matrix .reshape (self .data_matrix .shape [0 ], - 1 ).T # flatten data instances and transpose
32- mean = np .mean (data_matrix_2d , axis = 1 )
33- centered_data_matrix_2d = (data_matrix_2d .T - mean ).T
34- trick_cov_matrix = np .dot (centered_data_matrix_2d .T ,centered_data_matrix_2d ) * 1.0 / np .sqrt (N - 1 )
35- # get eignevectors and eigenvalues
36-
37- # Check if percent_variability is within valid range
38- if percent_variability < 0 or percent_variability > 100 :
39- percent_variability = 100
40-
41- eigen_values , eigen_vectors = np .linalg .eigh (trick_cov_matrix )
42- eigen_vectors = np .dot (centered_data_matrix_2d , eigen_vectors )
43- for i in range (N ):
44- eigen_vectors [:,i ] = eigen_vectors [:,i ]/ np .linalg .norm (eigen_vectors [:,i ])
45- eigen_values = np .flip (eigen_values )
46- eigen_vectors = np .flip (eigen_vectors , 1 )
47- # get num PCA components
48- cumDst = np .cumsum (eigen_values ) / np .sum (eigen_values )
49- if num_dim == 0 :
50- cumDst = np .cumsum (eigen_values ) / np .sum (eigen_values )
51- num_dim = np .where (cumDst > float (percent_variability ))
52- if num_dim and len (num_dim [0 ]) > 0 :
53- num_dim = num_dim [0 ][0 ] + 1
54- else :
55- num_dim = len (cumDst )
56- W = eigen_vectors [:, :num_dim ]
57- PCA_scores = np .matmul (centered_data_matrix_2d .T , W )
58- sw_message (f"The PCA modes of particles being retained : { num_dim } " )
59- sw_message (f"Variablity preserved: { str (float (cumDst [num_dim - 1 ]))} " )
60- self .num_dim = num_dim
61- self .PCA_scores = PCA_scores
62- self .eigen_vectors = eigen_vectors
63- self .eigen_values = eigen_values
64- return num_dim
65-
66- # write PCA info to files
67- # @TODO do we need all of this?
68- def write_PCA (self , out_dir , suffix ):
69- if not os .path .exists (out_dir ):
70- os .makedirs (out_dir )
71- np .save (out_dir + 'original_PCA_scores.npy' , self .PCA_scores )
72- mean = np .mean (self .data_matrix , axis = 0 )
73- np .savetxt (out_dir + 'mean.' + suffix , mean )
74- np .savetxt (out_dir + 'eigenvalues.txt' , self .eigen_values )
75- for i in range (self .data_matrix .shape [0 ]):
76- nm = out_dir + 'pcamode' + str (i ) + '.' + suffix
77- data = self .eigen_vectors [:, i ]
78- data = data .reshape (self .data_matrix .shape [1 :])
79- np .savetxt (nm , data )
80- # returns embedded form of dtat_matrix
81- def getEmbeddedMatrix (self ):
82- return self .PCA_scores
83- # projects embbed array into data
84- def project (self , PCA_instance ):
85- W = self .eigen_vectors [:, :self .num_dim ].T
86- mean = np .mean (self .data_matrix , axis = 0 )
87- data_instance = np .matmul (PCA_instance , W ) + mean .reshape (- 1 )
88- data_instance = data_instance .reshape ((self .data_matrix .shape [1 :]))
89- return data_instance
24+ def __init__ (self , data_matrix = None , num_dim = 0 , percent_variability = 0.95 ):
25+ """
26+ Initialize the PCA_Embedder. If data_matrix is provided, a PCA model is generated.
27+ Otherwise, the attributes defining the model are initialized as None. A model can then be initialized from arrays
28+ using load_pca.
29+
30+ Parameters
31+ ----------
32+ data_matrix
33+ Data to use to generate a PCA model
34+ num_dim
35+ Number of PCA dimensions to keep in the generated PCA scores. (Max is data_matrix.shape[0]-1, i.e., the maximum number of
36+ modes of variation is one less than the number of samples used to build the model.
37+ If set to zero, the maximum number of dimensions are kept.
38+ percent_variability
39+ Percentage of the variation in the input data to keep in the generated PCA scores, scaled to between 0 and 1.
40+ This is only used if num_dim is not set.
41+ """
42+ super ().__init__ (data_matrix )
43+ self .PCA_scores = None
44+ self .eigen_vectors = None
45+ self .eigen_values = None
46+
47+ if data_matrix is not None :
48+ self .data_matrix = data_matrix
49+ self .mean_data = np .mean (self .data_matrix , axis = 0 )
50+ self .run_PCA (num_dim , percent_variability )
51+
52+ def run_PCA (self , num_dim , percent_variability ):
53+ """
54+ Perform principal component analysis on the data_matrix.
55+
56+ Parameters
57+ ----------
58+ num_dim
59+ Number of PCA dimensions to keep in the model. (Max is data_matrix.shape[0]-1, i.e., the maximum number of
60+ modes of variation is one less than the number of samples used to build the model.
61+ If set to zero, the maximum number of dimensions are kept.
62+ percent_variability
63+ Percentage of the variation in the input data to keep in the model, scaled to between 0 and 1.
64+ This is only used if num_dim is not set.
65+
66+ Returns
67+ -------
68+ num_dim
69+ num_dim actually used
70+ """
71+ # get covariance matrix (uses compact trick)
72+ N = self .data_matrix .shape [0 ]
73+ data_matrix_2d = self .data_matrix .reshape (self .data_matrix .shape [0 ],
74+ - 1 ).T # flatten data instances and transpose
75+ mean = np .mean (data_matrix_2d , axis = 1 )
76+ centered_data_matrix_2d = (data_matrix_2d .T - mean ).T
77+ trick_cov_matrix = np .dot (centered_data_matrix_2d .T , centered_data_matrix_2d ) * 1.0 / np .sqrt (N - 1 )
78+ # get eignevectors and eigenvalues
79+
80+ # Check if percent_variability is within valid range
81+ if percent_variability < 0 or percent_variability > 100 :
82+ percent_variability = 100
83+
84+ eigen_values , eigen_vectors = np .linalg .eigh (trick_cov_matrix )
85+ eigen_vectors = np .dot (centered_data_matrix_2d , eigen_vectors )
86+ for i in range (N ):
87+ eigen_vectors [:, i ] = eigen_vectors [:, i ] / np .linalg .norm (eigen_vectors [:, i ])
88+ eigen_values = np .flip (eigen_values )
89+ eigen_vectors = np .flip (eigen_vectors , 1 )
90+ # get num PCA components
91+ # Note that the number of the eigen_values and eigen_vectors is equal to the dimension of the data
92+ # matrix, but the last column is not used in the model because it describes no variation.
93+ cumDst = np .cumsum (eigen_values ) / np .sum (eigen_values )
94+ if num_dim == 0 :
95+ cumDst = np .cumsum (eigen_values ) / np .sum (eigen_values )
96+ num_dim = np .where (cumDst >= float (percent_variability ))
97+ if num_dim and len (num_dim [0 ]) > 0 :
98+ num_dim = num_dim [0 ][0 ] + 1
99+ else :
100+ num_dim = len (cumDst )
101+ W = eigen_vectors [:, :num_dim ]
102+ PCA_scores = np .matmul (centered_data_matrix_2d .T , W )
103+ sw_message (f"The PCA modes of particles being retained : { num_dim } " )
104+ sw_message (f"Variablity preserved: { str (float (cumDst [num_dim - 1 ]))} " )
105+
106+ self .PCA_scores = PCA_scores
107+ self .eigen_vectors = eigen_vectors
108+ self .eigen_values = eigen_values
109+ self .num_dim = num_dim
110+ return num_dim
111+
112+ def write_PCA (self , out_dir : Path , score_option = "full" , suffix = "txt" ):
113+ """
114+ Write PCA data to a specified directory.
115+
116+ Parameters
117+ ----------
118+ out_dir
119+ Directory in which to save PCA data
120+ score_option
121+ Option for how to save PCA scores. The full scores can be used to recreate the data used to create the
122+ model, which may be privileged information, so options are provided to save no information about the scores.
123+ Options are:
124+ full: Save complete scores
125+ Otherwise: Don't save scores
126+ suffix
127+ File extension to use
128+ """
129+ out_dir = Path (out_dir )
130+ if not os .path .exists (out_dir ):
131+ os .makedirs (out_dir )
132+ if score_option == "full" :
133+ np .savetxt (str (out_dir / f'original_PCA_scores.{ suffix } ' ), self .PCA_scores )
134+ mean = np .mean (self .data_matrix , axis = 0 )
135+ np .savetxt (str (out_dir / f'mean.{ suffix } ' ), mean )
136+ np .savetxt (str (out_dir / f'eigenvalues.txt' ), self .eigen_values )
137+ for i in range (self .data_matrix .shape [0 ]):
138+ nm = str (out_dir / f'pcamode{ i } .{ suffix } ' )
139+ data = self .eigen_vectors [:, i ]
140+ data = data .reshape (self .data_matrix .shape [1 :])
141+ np .savetxt (nm , data )
142+
143+ def project (self , PCA_instance ):
144+ """
145+ Maps a given set of scores to the data values (e.g., coordinate points) they represent, given the embedded
146+ PCA model.
147+
148+ Parameters
149+ ----------
150+ PCA_instance
151+ A row vector containing one score for each PCA mode.
152+
153+ Returns
154+ -------
155+ data instance
156+ Data represented by the input scores for this PCA model
157+
158+ """
159+ num_dim = len (PCA_instance )
160+ W = self .eigen_vectors [:, :num_dim ].T
161+ data_instance = np .matmul (PCA_instance , W ) + self .mean_data .reshape (- 1 )
162+ data_instance = data_instance .reshape (self .mean_data .shape )
163+ return data_instance
164+
165+ def load_PCA (self , mean_data , eigen_values , eigen_vectors , scores = None ):
166+ """
167+ Load PCA model from arrays
168+
169+ Parameters
170+ ----------
171+ mean_data
172+ eigen_values
173+ eigen_vectors
174+ scores
175+ """
176+ self .mean_data = mean_data
177+ self .eigen_values = eigen_values
178+ self .eigen_vectors = eigen_vectors
179+ self .PCA_scores = scores
180+
181+ @classmethod
182+ def from_directory (cls , directory : Path ):
183+ """
184+ Factory function to create a PCA_embedder instance by loading saved data from a specified directory.
185+
186+ Parameters
187+ ----------
188+ directory
189+ Directory from which to load data
190+
191+ Returns
192+ -------
193+ embedder
194+ PCA_embedder instance
195+
196+ """
197+ directory = Path (directory )
198+
199+ mean = np .loadtxt (glob (str (directory / "mean*" ))[0 ])
200+ eigen_values = np .loadtxt (glob (str (directory / "eigenvalues*" ))[0 ])
201+ eigen_vectors = []
202+
203+ eigen_vector_files = glob (str (directory / "pcamode*" ))
204+ eigen_vector_files .sort (key = lambda f : int (str (Path (f ).stem ).split ("pcamode" )[- 1 ])) # Sort numerically by name
205+ for file in eigen_vector_files :
206+ eigen_vector = np .flip (np .loadtxt (file ))
207+ eigen_vectors .append (eigen_vector .reshape ((- 1 )))
208+
209+ eigen_vectors = np .rot90 (np .array (eigen_vectors ))
210+
211+ embedder = cls ()
212+
213+ scores = None
214+ if scores_glob := glob (str (directory / "original_PCA_scores*" )):
215+ scores = np .loadtxt (scores_glob [0 ])
216+
217+ embedder .load_PCA (mean , eigen_values , eigen_vectors , scores = scores )
218+
219+ return embedder
220+
221+ def getEmbeddedMatrix (self ):
222+ """
223+ Get the embedded form of data_matrix
224+
225+ Returns
226+ -------
227+ PCA_scores
228+ A matrix with one row for each input data sample
229+ The columns are floats that represent the value for each PCA mode that together represent the input data
230+ sample.
231+ The number of columns indicates the number of PCA modes that were used to generate the scores.
232+
233+ """
234+ return self .PCA_scores
0 commit comments