Skip to content

Commit f15d0fd

Browse files
avantikalallala8
andauthored
Motif syntax (#15)
* added functions for motif orientation and distance comparisons between groups * start all pair calculations from FIMO output * updated groupwise tests to return mean and log2FC * reran yeast tutorial * reran human tutorial * removed fastsk requirement --------- Co-authored-by: lala8 <[email protected]>
1 parent 5164a08 commit f15d0fd

File tree

7 files changed

+8246
-4904
lines changed

7 files changed

+8246
-4904
lines changed

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ install_requires =
6161
scikit_posthocs
6262
scipy
6363
plotnine
64+
# fastsk
6465

6566

6667
[options.packages.find]

src/polygraph/classifier.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def groupwise_svm(
6666
).codes
6767

6868
# Train SVM
69-
svm = LinearSVC(C=2, max_iter=max_iter)
69+
svm = LinearSVC(C=2, max_iter=max_iter, dual=True)
7070
clf = CalibratedClassifierCV(svm, cv=cv).fit(Xtrain, Ytrain)
7171

7272
# Get predictions

src/polygraph/motifs.py

Lines changed: 223 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def scan(seqs, meme_file, group_col="Group", pthresh=1e-3, rc=True):
2020
rc (bool): Whether to scan the sequence reverse complement as well
2121
2222
Returns:
23-
pd.DataFrame containing columns 'Matrix_id', 'seq_id', 'start', 'end', 'strand'.
23+
pd.DataFrame containing columns 'MotifID', 'SeqID', 'start', 'end', 'strand'.
2424
"""
2525
from collections import defaultdict
2626

@@ -137,101 +137,256 @@ def nmf(counts, seqs, reference_group, group_col="Group", n_components=10):
137137
return W, H, res
138138

139139

140-
def motif_combinations(
141-
counts,
140+
def get_motif_pairs(sites):
141+
"""
142+
List the pairs of motifs present in each sequence.
143+
144+
Args:
145+
sites (pd.DataFrame): Pandas dataframe containing FIMO output.
146+
147+
Returns:
148+
pairs (pd.DataFrame): Dataframe containing all motif pairs in each
149+
sequence with their orientation and distance.
150+
"""
151+
# Get midpoint of motif
152+
sites["mid"] = sites["start"] + ((sites["end"] - sites["start"]) / 2)
153+
154+
# List the motif, strand and position for each site
155+
df = sites[["SeqID"]].copy()
156+
df["data"] = sites[["MotifID", "strand", "mid"]].apply(
157+
lambda row: row.tolist(), axis=1
158+
)
159+
160+
# Get all pairs of sites present in each sequence
161+
pairs = pd.DataFrame(
162+
df.groupby("SeqID")["data"].apply(lambda x: list(itertools.combinations(x, 2))),
163+
columns=["data"],
164+
)
165+
pairs = pairs.explode("data")
166+
167+
# Get the orientation and distance for each pair of motifs
168+
pairs = pd.DataFrame(
169+
pairs.data.apply(lambda x: list(zip(*x))).tolist(),
170+
index=pairs.index,
171+
columns=["MotifID", "strand", "pos"],
172+
)
173+
pairs["orientation"] = pairs.strand.apply(
174+
lambda x: "same" if len(set(x)) == 1 else "opposite"
175+
)
176+
pairs["distance"] = pairs.pos.apply(lambda x: np.abs(x[1] - x[0]))
177+
return pairs[["MotifID", "orientation", "distance"]]
178+
179+
180+
def _filter_motif_pairs(
181+
motif_pairs,
182+
seqs,
183+
reference_group=None,
184+
group_col="Group",
185+
min_prop_cutoff=0,
186+
max_prop_cutoff=0,
187+
ref_prop_cutoff=0,
188+
):
189+
# Count occurrence of motif pairs in each group
190+
cts = motif_pairs[["MotifID", group_col]].reset_index().drop_duplicates()
191+
cts = cts[["MotifID", group_col]].value_counts().reset_index(name="count")
192+
cts["group_total"] = seqs[group_col].value_counts()[cts[group_col]].tolist()
193+
cts["group_prop"] = cts["count"] / cts.group_total
194+
cts = cts.pivot_table(
195+
index="MotifID", columns=group_col, values="group_prop"
196+
).fillna(0)
197+
198+
# Filter rare pairs by proportion
199+
sel_pairs = set(
200+
cts.index[(cts.max(1) >= max_prop_cutoff) & (cts.min(1) >= min_prop_cutoff)]
201+
)
202+
if ref_prop_cutoff > 0:
203+
assert reference_group is not None
204+
sel_pairs = sel_pairs.intersection(
205+
cts.index[cts[reference_group] >= ref_prop_cutoff]
206+
)
207+
208+
print(f"Selected {len(sel_pairs)} pairs based on in-group proportion")
209+
motif_pairs = motif_pairs[motif_pairs.MotifID.isin(sel_pairs)]
210+
211+
return motif_pairs.copy()
212+
213+
214+
def motif_pair_differential_abundance(
215+
motif_pairs,
142216
seqs,
143217
reference_group,
144218
group_col="Group",
145-
min_group_freq=10,
146-
min_group_prop=None,
219+
max_prop_cutoff=0,
220+
min_prop_cutoff=0,
221+
ref_prop_cutoff=0,
147222
):
148223
"""
149-
Count occurences of pairwise combinations of motifs and compare between groups
224+
Compare the rate of occurence of pairwise combinations of motifs between groups
150225
151226
Args:
152-
counts (pd.DataFrame): Pandas dataframe containing the motif count matrix.
153-
Rows should be sequences and columns should be motifs.
227+
motif_pairs (pd.DataFrame): Pandas dataframe containing the ouptut of
228+
get_motif_pairs.
154229
seqs (pd.DataFrame): Pandas dataframe containing sequences
155230
reference_group (str): ID of group to use as reference
156231
group_col (str): Name of column in `seqs` containing group IDs
157-
min_group_freq (int): Limit to combinations with this number of occurences in
232+
max_prop_cutoff (int): Limit to combinations with this proportion in
158233
at least one group.
159-
min_group_prop (float): Limit to combinations with this proportion of occurences
160-
in at least one group.
234+
min_prop_cutoff (float): Limit to combinations with this proportion in
235+
in all groups.
161236
162237
Returns:
163238
res (pd.DataFrame): Pandas dataframe containing FDR-corrected significance
164239
testing results for the occurrence of pairwise combinations between groups
165240
"""
241+
res = pd.DataFrame()
242+
motif_pairs = motif_pairs.merge(
243+
seqs[[group_col]], left_index=True, right_index=True
244+
)
166245

167-
print("Listing motif combinations")
246+
if (max_prop_cutoff > 0) or (min_prop_cutoff > 0) or (ref_prop_cutoff > 0):
247+
motif_pairs = _filter_motif_pairs(
248+
motif_pairs,
249+
seqs,
250+
reference_group=reference_group,
251+
group_col=group_col,
252+
max_prop_cutoff=max_prop_cutoff,
253+
min_prop_cutoff=min_prop_cutoff,
254+
ref_prop_cutoff=ref_prop_cutoff,
255+
)
168256

169-
# Get the complete set of motifs present in each sequence
170-
motif_combinations = pd.DataFrame(
171-
counts.apply(lambda row: set(counts.columns[np.where(row > 0)[0]]), axis=1)
172-
)
173-
motif_combinations.columns = ["motifs"]
174-
motif_combinations = motif_combinations.merge(
257+
df = seqs[[group_col]].copy()
258+
for pair in motif_pairs.MotifID.unique():
259+
seqs_with_pair = motif_pairs[motif_pairs.MotifID == pair].index
260+
df["has_pair"] = df.index.isin(seqs_with_pair)
261+
curr_res = groupwise_fishers(
262+
df,
263+
reference_group=reference_group,
264+
val_col="has_pair",
265+
reference_val=None,
266+
group_col=group_col,
267+
).reset_index()
268+
curr_res["MotifID"] = [pair] * len(curr_res)
269+
res = pd.concat([res, curr_res])
270+
271+
# FDR correction
272+
res["padj"] = fdrcorrection(res.pval)[1]
273+
274+
return res.reset_index(drop=True)
275+
276+
277+
def motif_pair_differential_orientation(
278+
motif_pairs,
279+
seqs,
280+
reference_group,
281+
group_col="Group",
282+
max_prop_cutoff=0,
283+
min_prop_cutoff=0,
284+
ref_prop_cutoff=0,
285+
):
286+
"""
287+
Compare the mutual orientation of all motif pairs between groups.
288+
289+
Args:
290+
motif_pairs (pd.DataFrame): Pandas dataframe containing the ouptut of
291+
get_motif_pairs.
292+
seqs (pd.DataFrame): Pandas dataframe containing sequences
293+
reference_group (str): ID of group to use as reference
294+
group_col (str): Name of column in `seqs` containing group IDs
295+
max_prop_cutoff (int): Limit to combinations with this proportion in
296+
at least one group.
297+
min_prop_cutoff (float): Limit to combinations with this proportion in
298+
in all groups.
299+
300+
Returns:
301+
res (pd.DataFrame): Pandas dataframe containing FDR-corrected significance
302+
testing results for the mutual orientation of pairwise combinations
303+
between groups
304+
305+
"""
306+
res = pd.DataFrame()
307+
motif_pairs = motif_pairs.merge(
175308
seqs[[group_col]], left_index=True, right_index=True
176309
)
177310

178-
# Get pairwise combinations present in each sequence
179-
motif_combinations["combination"] = motif_combinations.motifs.apply(
180-
lambda x: list(itertools.combinations(x, 2))
181-
)
182-
motif_combinations = motif_combinations[["combination", group_col]].explode(
183-
"combination"
184-
)
311+
if (max_prop_cutoff > 0) or (min_prop_cutoff > 0) or (ref_prop_cutoff > 0):
312+
motif_pairs = _filter_motif_pairs(
313+
motif_pairs,
314+
seqs,
315+
reference_group=reference_group,
316+
group_col=group_col,
317+
max_prop_cutoff=max_prop_cutoff,
318+
min_prop_cutoff=min_prop_cutoff,
319+
ref_prop_cutoff=ref_prop_cutoff,
320+
)
185321

186-
print("Making count matrix")
187-
# Count the number of sequences in which each motif combination is present
188-
cts = (
189-
motif_combinations[["combination", group_col]]
190-
.value_counts()
191-
.reset_index(name="count")
192-
)
322+
for pair in motif_pairs.MotifID.unique():
323+
curr_res = groupwise_fishers(
324+
motif_pairs[motif_pairs.MotifID == pair],
325+
reference_group=reference_group,
326+
val_col="orientation",
327+
reference_val="same",
328+
group_col=group_col,
329+
).reset_index()
330+
curr_res["MotifID"] = [pair] * len(curr_res)
331+
res = pd.concat([res, curr_res])
193332

194-
print("Filtering")
195-
# Drop rare combinations
196-
if min_group_freq is not None:
197-
comb_max = cts.groupby("combination")["count"].max()
198-
sel_comb = cts.combination[
199-
cts.combination.isin(comb_max[comb_max > min_group_freq].index)
200-
].tolist()
201-
print(f"Selected {len(sel_comb)} combinations")
202-
motif_combinations = motif_combinations[
203-
motif_combinations.combination.isin(sel_comb)
204-
]
205-
cts = cts[cts.combination.isin(sel_comb)]
206-
207-
elif min_group_prop is not None:
208-
cts["group_total"] = seqs.Group.value_counts()[cts.Group].tolist()
209-
cts["group_prop"] = cts["count"] / cts.group_total
210-
# print(cts)
211-
sel_comb = set(cts.combination[cts.group_prop > min_group_prop])
212-
print(f"Selected {len(sel_comb)} combinations")
213-
motif_combinations = motif_combinations[
214-
motif_combinations.combination.isin(sel_comb)
215-
]
216-
cts = cts[cts.combination.isin(sel_comb)]
217-
218-
print("Significance testing")
219-
df = seqs[[group_col]].copy()
333+
# FDR correction
334+
res["padj"] = fdrcorrection(res.pval)[1]
335+
336+
return res.reset_index(drop=True)
337+
338+
339+
def motif_pair_differential_distance(
340+
motif_pairs,
341+
seqs,
342+
reference_group,
343+
group_col="Group",
344+
max_prop_cutoff=0,
345+
min_prop_cutoff=0,
346+
ref_prop_cutoff=0,
347+
):
348+
"""
349+
Compare the distance between all motif pairs across groups.
350+
351+
Args:
352+
motif_pairs (pd.DataFrame): Pandas dataframe containing the ouptut of
353+
get_motif_pairs.
354+
seqs (pd.DataFrame): Pandas dataframe containing sequences
355+
reference_group (str): ID of group to use as reference
356+
group_col (str): Name of column in `seqs` containing group IDs
357+
max_prop_cutoff (int): Limit to combinations with this proportion in
358+
at least one group.
359+
min_prop_cutoff (float): Limit to combinations with this proportion in
360+
in all groups.
361+
362+
Returns:
363+
res (pd.DataFrame): Pandas dataframe containing FDR-corrected significance
364+
testing results for the distance between paired motifs, between groups
365+
"""
220366
res = pd.DataFrame()
367+
motif_pairs = motif_pairs.merge(
368+
seqs[[group_col]], left_index=True, right_index=True
369+
)
221370

222-
for comb in cts.combination.unique():
223-
seqs_with_comb = motif_combinations[
224-
motif_combinations.combination == comb
225-
].index.tolist()
226-
df["has_comb"] = df.index.isin(seqs_with_comb)
227-
curr_res = groupwise_fishers(
228-
df,
371+
if (max_prop_cutoff > 0) or (min_prop_cutoff > 0) or (ref_prop_cutoff > 0):
372+
motif_pairs = _filter_motif_pairs(
373+
motif_pairs,
374+
seqs,
229375
reference_group=reference_group,
230-
val_col="has_comb",
231-
reference_val=None,
376+
group_col=group_col,
377+
max_prop_cutoff=max_prop_cutoff,
378+
min_prop_cutoff=min_prop_cutoff,
379+
ref_prop_cutoff=ref_prop_cutoff,
380+
)
381+
382+
for pair in motif_pairs.MotifID.unique():
383+
curr_res = groupwise_mann_whitney(
384+
motif_pairs[motif_pairs.MotifID == pair],
385+
reference_group=reference_group,
386+
val_col="distance",
232387
group_col=group_col,
233388
).reset_index()
234-
curr_res["combination"] = [comb] * len(curr_res)
389+
curr_res["MotifID"] = [pair] * len(curr_res)
235390
res = pd.concat([res, curr_res])
236391

237392
# FDR correction

0 commit comments

Comments
 (0)