@@ -20,7 +20,7 @@ def scan(seqs, meme_file, group_col="Group", pthresh=1e-3, rc=True):
2020 rc (bool): Whether to scan the sequence reverse complement as well
2121
2222 Returns:
23- pd.DataFrame containing columns 'Matrix_id ', 'seq_id ', 'start', 'end', 'strand'.
23+ pd.DataFrame containing columns 'MotifID ', 'SeqID ', 'start', 'end', 'strand'.
2424 """
2525 from collections import defaultdict
2626
@@ -137,101 +137,256 @@ def nmf(counts, seqs, reference_group, group_col="Group", n_components=10):
137137 return W , H , res
138138
139139
140- def motif_combinations (
141- counts ,
140+ def get_motif_pairs (sites ):
141+ """
142+ List the pairs of motifs present in each sequence.
143+
144+ Args:
145+ sites (pd.DataFrame): Pandas dataframe containing FIMO output.
146+
147+ Returns:
148+ pairs (pd.DataFrame): Dataframe containing all motif pairs in each
149+ sequence with their orientation and distance.
150+ """
151+ # Get midpoint of motif
152+ sites ["mid" ] = sites ["start" ] + ((sites ["end" ] - sites ["start" ]) / 2 )
153+
154+ # List the motif, strand and position for each site
155+ df = sites [["SeqID" ]].copy ()
156+ df ["data" ] = sites [["MotifID" , "strand" , "mid" ]].apply (
157+ lambda row : row .tolist (), axis = 1
158+ )
159+
160+ # Get all pairs of sites present in each sequence
161+ pairs = pd .DataFrame (
162+ df .groupby ("SeqID" )["data" ].apply (lambda x : list (itertools .combinations (x , 2 ))),
163+ columns = ["data" ],
164+ )
165+ pairs = pairs .explode ("data" )
166+
167+ # Get the orientation and distance for each pair of motifs
168+ pairs = pd .DataFrame (
169+ pairs .data .apply (lambda x : list (zip (* x ))).tolist (),
170+ index = pairs .index ,
171+ columns = ["MotifID" , "strand" , "pos" ],
172+ )
173+ pairs ["orientation" ] = pairs .strand .apply (
174+ lambda x : "same" if len (set (x )) == 1 else "opposite"
175+ )
176+ pairs ["distance" ] = pairs .pos .apply (lambda x : np .abs (x [1 ] - x [0 ]))
177+ return pairs [["MotifID" , "orientation" , "distance" ]]
178+
179+
180+ def _filter_motif_pairs (
181+ motif_pairs ,
182+ seqs ,
183+ reference_group = None ,
184+ group_col = "Group" ,
185+ min_prop_cutoff = 0 ,
186+ max_prop_cutoff = 0 ,
187+ ref_prop_cutoff = 0 ,
188+ ):
189+ # Count occurrence of motif pairs in each group
190+ cts = motif_pairs [["MotifID" , group_col ]].reset_index ().drop_duplicates ()
191+ cts = cts [["MotifID" , group_col ]].value_counts ().reset_index (name = "count" )
192+ cts ["group_total" ] = seqs [group_col ].value_counts ()[cts [group_col ]].tolist ()
193+ cts ["group_prop" ] = cts ["count" ] / cts .group_total
194+ cts = cts .pivot_table (
195+ index = "MotifID" , columns = group_col , values = "group_prop"
196+ ).fillna (0 )
197+
198+ # Filter rare pairs by proportion
199+ sel_pairs = set (
200+ cts .index [(cts .max (1 ) >= max_prop_cutoff ) & (cts .min (1 ) >= min_prop_cutoff )]
201+ )
202+ if ref_prop_cutoff > 0 :
203+ assert reference_group is not None
204+ sel_pairs = sel_pairs .intersection (
205+ cts .index [cts [reference_group ] >= ref_prop_cutoff ]
206+ )
207+
208+ print (f"Selected { len (sel_pairs )} pairs based on in-group proportion" )
209+ motif_pairs = motif_pairs [motif_pairs .MotifID .isin (sel_pairs )]
210+
211+ return motif_pairs .copy ()
212+
213+
214+ def motif_pair_differential_abundance (
215+ motif_pairs ,
142216 seqs ,
143217 reference_group ,
144218 group_col = "Group" ,
145- min_group_freq = 10 ,
146- min_group_prop = None ,
219+ max_prop_cutoff = 0 ,
220+ min_prop_cutoff = 0 ,
221+ ref_prop_cutoff = 0 ,
147222):
148223 """
149- Count occurences of pairwise combinations of motifs and compare between groups
224+ Compare the rate of occurence of pairwise combinations of motifs between groups
150225
151226 Args:
152- counts (pd.DataFrame): Pandas dataframe containing the motif count matrix.
153- Rows should be sequences and columns should be motifs .
227+ motif_pairs (pd.DataFrame): Pandas dataframe containing the ouptut of
228+ get_motif_pairs .
154229 seqs (pd.DataFrame): Pandas dataframe containing sequences
155230 reference_group (str): ID of group to use as reference
156231 group_col (str): Name of column in `seqs` containing group IDs
157- min_group_freq (int): Limit to combinations with this number of occurences in
232+ max_prop_cutoff (int): Limit to combinations with this proportion in
158233 at least one group.
159- min_group_prop (float): Limit to combinations with this proportion of occurences
160- in at least one group .
234+ min_prop_cutoff (float): Limit to combinations with this proportion in
235+ in all groups .
161236
162237 Returns:
163238 res (pd.DataFrame): Pandas dataframe containing FDR-corrected significance
164239 testing results for the occurrence of pairwise combinations between groups
165240 """
241+ res = pd .DataFrame ()
242+ motif_pairs = motif_pairs .merge (
243+ seqs [[group_col ]], left_index = True , right_index = True
244+ )
166245
167- print ("Listing motif combinations" )
246+ if (max_prop_cutoff > 0 ) or (min_prop_cutoff > 0 ) or (ref_prop_cutoff > 0 ):
247+ motif_pairs = _filter_motif_pairs (
248+ motif_pairs ,
249+ seqs ,
250+ reference_group = reference_group ,
251+ group_col = group_col ,
252+ max_prop_cutoff = max_prop_cutoff ,
253+ min_prop_cutoff = min_prop_cutoff ,
254+ ref_prop_cutoff = ref_prop_cutoff ,
255+ )
168256
169- # Get the complete set of motifs present in each sequence
170- motif_combinations = pd .DataFrame (
171- counts .apply (lambda row : set (counts .columns [np .where (row > 0 )[0 ]]), axis = 1 )
172- )
173- motif_combinations .columns = ["motifs" ]
174- motif_combinations = motif_combinations .merge (
257+ df = seqs [[group_col ]].copy ()
258+ for pair in motif_pairs .MotifID .unique ():
259+ seqs_with_pair = motif_pairs [motif_pairs .MotifID == pair ].index
260+ df ["has_pair" ] = df .index .isin (seqs_with_pair )
261+ curr_res = groupwise_fishers (
262+ df ,
263+ reference_group = reference_group ,
264+ val_col = "has_pair" ,
265+ reference_val = None ,
266+ group_col = group_col ,
267+ ).reset_index ()
268+ curr_res ["MotifID" ] = [pair ] * len (curr_res )
269+ res = pd .concat ([res , curr_res ])
270+
271+ # FDR correction
272+ res ["padj" ] = fdrcorrection (res .pval )[1 ]
273+
274+ return res .reset_index (drop = True )
275+
276+
277+ def motif_pair_differential_orientation (
278+ motif_pairs ,
279+ seqs ,
280+ reference_group ,
281+ group_col = "Group" ,
282+ max_prop_cutoff = 0 ,
283+ min_prop_cutoff = 0 ,
284+ ref_prop_cutoff = 0 ,
285+ ):
286+ """
287+ Compare the mutual orientation of all motif pairs between groups.
288+
289+ Args:
290+ motif_pairs (pd.DataFrame): Pandas dataframe containing the ouptut of
291+ get_motif_pairs.
292+ seqs (pd.DataFrame): Pandas dataframe containing sequences
293+ reference_group (str): ID of group to use as reference
294+ group_col (str): Name of column in `seqs` containing group IDs
295+ max_prop_cutoff (int): Limit to combinations with this proportion in
296+ at least one group.
297+ min_prop_cutoff (float): Limit to combinations with this proportion in
298+ in all groups.
299+
300+ Returns:
301+ res (pd.DataFrame): Pandas dataframe containing FDR-corrected significance
302+ testing results for the mutual orientation of pairwise combinations
303+ between groups
304+
305+ """
306+ res = pd .DataFrame ()
307+ motif_pairs = motif_pairs .merge (
175308 seqs [[group_col ]], left_index = True , right_index = True
176309 )
177310
178- # Get pairwise combinations present in each sequence
179- motif_combinations ["combination" ] = motif_combinations .motifs .apply (
180- lambda x : list (itertools .combinations (x , 2 ))
181- )
182- motif_combinations = motif_combinations [["combination" , group_col ]].explode (
183- "combination"
184- )
311+ if (max_prop_cutoff > 0 ) or (min_prop_cutoff > 0 ) or (ref_prop_cutoff > 0 ):
312+ motif_pairs = _filter_motif_pairs (
313+ motif_pairs ,
314+ seqs ,
315+ reference_group = reference_group ,
316+ group_col = group_col ,
317+ max_prop_cutoff = max_prop_cutoff ,
318+ min_prop_cutoff = min_prop_cutoff ,
319+ ref_prop_cutoff = ref_prop_cutoff ,
320+ )
185321
186- print ("Making count matrix" )
187- # Count the number of sequences in which each motif combination is present
188- cts = (
189- motif_combinations [["combination" , group_col ]]
190- .value_counts ()
191- .reset_index (name = "count" )
192- )
322+ for pair in motif_pairs .MotifID .unique ():
323+ curr_res = groupwise_fishers (
324+ motif_pairs [motif_pairs .MotifID == pair ],
325+ reference_group = reference_group ,
326+ val_col = "orientation" ,
327+ reference_val = "same" ,
328+ group_col = group_col ,
329+ ).reset_index ()
330+ curr_res ["MotifID" ] = [pair ] * len (curr_res )
331+ res = pd .concat ([res , curr_res ])
193332
194- print ("Filtering" )
195- # Drop rare combinations
196- if min_group_freq is not None :
197- comb_max = cts .groupby ("combination" )["count" ].max ()
198- sel_comb = cts .combination [
199- cts .combination .isin (comb_max [comb_max > min_group_freq ].index )
200- ].tolist ()
201- print (f"Selected { len (sel_comb )} combinations" )
202- motif_combinations = motif_combinations [
203- motif_combinations .combination .isin (sel_comb )
204- ]
205- cts = cts [cts .combination .isin (sel_comb )]
206-
207- elif min_group_prop is not None :
208- cts ["group_total" ] = seqs .Group .value_counts ()[cts .Group ].tolist ()
209- cts ["group_prop" ] = cts ["count" ] / cts .group_total
210- # print(cts)
211- sel_comb = set (cts .combination [cts .group_prop > min_group_prop ])
212- print (f"Selected { len (sel_comb )} combinations" )
213- motif_combinations = motif_combinations [
214- motif_combinations .combination .isin (sel_comb )
215- ]
216- cts = cts [cts .combination .isin (sel_comb )]
217-
218- print ("Significance testing" )
219- df = seqs [[group_col ]].copy ()
333+ # FDR correction
334+ res ["padj" ] = fdrcorrection (res .pval )[1 ]
335+
336+ return res .reset_index (drop = True )
337+
338+
339+ def motif_pair_differential_distance (
340+ motif_pairs ,
341+ seqs ,
342+ reference_group ,
343+ group_col = "Group" ,
344+ max_prop_cutoff = 0 ,
345+ min_prop_cutoff = 0 ,
346+ ref_prop_cutoff = 0 ,
347+ ):
348+ """
349+ Compare the distance between all motif pairs across groups.
350+
351+ Args:
352+ motif_pairs (pd.DataFrame): Pandas dataframe containing the ouptut of
353+ get_motif_pairs.
354+ seqs (pd.DataFrame): Pandas dataframe containing sequences
355+ reference_group (str): ID of group to use as reference
356+ group_col (str): Name of column in `seqs` containing group IDs
357+ max_prop_cutoff (int): Limit to combinations with this proportion in
358+ at least one group.
359+ min_prop_cutoff (float): Limit to combinations with this proportion in
360+ in all groups.
361+
362+ Returns:
363+ res (pd.DataFrame): Pandas dataframe containing FDR-corrected significance
364+ testing results for the distance between paired motifs, between groups
365+ """
220366 res = pd .DataFrame ()
367+ motif_pairs = motif_pairs .merge (
368+ seqs [[group_col ]], left_index = True , right_index = True
369+ )
221370
222- for comb in cts .combination .unique ():
223- seqs_with_comb = motif_combinations [
224- motif_combinations .combination == comb
225- ].index .tolist ()
226- df ["has_comb" ] = df .index .isin (seqs_with_comb )
227- curr_res = groupwise_fishers (
228- df ,
371+ if (max_prop_cutoff > 0 ) or (min_prop_cutoff > 0 ) or (ref_prop_cutoff > 0 ):
372+ motif_pairs = _filter_motif_pairs (
373+ motif_pairs ,
374+ seqs ,
229375 reference_group = reference_group ,
230- val_col = "has_comb" ,
231- reference_val = None ,
376+ group_col = group_col ,
377+ max_prop_cutoff = max_prop_cutoff ,
378+ min_prop_cutoff = min_prop_cutoff ,
379+ ref_prop_cutoff = ref_prop_cutoff ,
380+ )
381+
382+ for pair in motif_pairs .MotifID .unique ():
383+ curr_res = groupwise_mann_whitney (
384+ motif_pairs [motif_pairs .MotifID == pair ],
385+ reference_group = reference_group ,
386+ val_col = "distance" ,
232387 group_col = group_col ,
233388 ).reset_index ()
234- curr_res ["combination " ] = [comb ] * len (curr_res )
389+ curr_res ["MotifID " ] = [pair ] * len (curr_res )
235390 res = pd .concat ([res , curr_res ])
236391
237392 # FDR correction
0 commit comments