76 changes: 54 additions & 22 deletions modiscolite/affinitymat.py
@@ -44,7 +44,7 @@ def _sparse_vv_dot(X_data, X_indices, X_indptr, Y_data, Y_indices, Y_indptr, i,
return dot

@njit(parallel=True)
def _sparse_mm_dot(X_data, X_indices, X_indptr, Y_data, Y_indices, Y_indptr, k):
def _sparse_mm_dot_revcomp(X_data, X_indices, X_indptr, Y_data, Y_indices, Y_indptr, k):
n_rows = len(Y_indptr) - 1

neighbors = np.empty((n_rows, k), dtype='int32')
@@ -64,35 +64,65 @@ def _sparse_mm_dot(X_data, X_indices, X_indptr, Y_data, Y_indices, Y_indptr, k):

return sims, neighbors

@njit(parallel=True)
def _sparse_mm_dot(X_data, X_indices, X_indptr, k):
n_rows = len(X_indptr) - 1

neighbors = np.empty((n_rows, k), dtype='int32')
sims = np.empty((n_rows, k), dtype='float64')

for i in prange(n_rows):
dot = np.zeros(n_rows, dtype='float64')

for j in range(n_rows):
xdot = _sparse_vv_dot(X_data, X_indices, X_indptr, X_data, X_indices, X_indptr, i, j)
dot[j] = xdot

dot_argsort = np.argsort(-dot, kind='mergesort')[:k]
neighbors[i] = dot_argsort
sims[i] = dot[dot_argsort]

return sims, neighbors
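
The new forward-only kernel drops the second (reverse-complement) matrix: each seqlet's neighbors are sought within its own forward gapped-kmer representation. For intuition, a dense sketch of what it computes, assuming L2-normalized rows so the dot product is a cosine similarity (`topk_self_similarity` is an illustrative name, not part of this PR):

```python
import numpy as np
import scipy.sparse
import sklearn.preprocessing

def topk_self_similarity(X, k):
    # Dense stand-in for the numba kernel: all-pairs dot products of the
    # L2-normalized rows, then the indices and values of the k best per row.
    dot = np.asarray((X @ X.T).todense())
    neighbors = np.argsort(-dot, axis=1, kind='mergesort')[:, :k]
    sims = np.take_along_axis(dot, neighbors, axis=1)
    return sims, neighbors

X = scipy.sparse.random(10, 50, density=0.2, format='csr', random_state=0)
X = sklearn.preprocessing.normalize(X, norm='l2', axis=1)
sims, neighbors = topk_self_similarity(X, k=3)
print(sims.shape, neighbors.shape)  # (10, 3) (10, 3)
```

Note that the caller below passes `k = min(n_neighbors + 1, n)` because each row's best match is itself.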

def cosine_similarity_from_seqlets(seqlets, n_neighbors, sign, topn=20,
min_k=4, max_k=6, max_gap=15, max_len=15, max_entries=500,
alphabet_size=4):
alphabet_size=4, revcomp=True):

X_fwd = gapped_kmer._seqlet_to_gkmers(seqlets, topn,
min_k, max_k, max_gap, max_len, max_entries, True, sign)

X_bwd = gapped_kmer._seqlet_to_gkmers(seqlets, topn, min_k, max_k, max_gap,
max_len, max_entries, False, sign)

X = sklearn.preprocessing.normalize(X_fwd, norm='l2', axis=1)
Y = sklearn.preprocessing.normalize(X_bwd, norm='l2', axis=1)

n, d = X.shape
k = min(n_neighbors+1, n)
return _sparse_mm_dot(X.data, X.indices, X.indptr, Y.data, Y.indices, Y.indptr, k)

if revcomp:
X_bwd = gapped_kmer._seqlet_to_gkmers(seqlets, topn, min_k, max_k, max_gap,
max_len, max_entries, False, sign)
Y = sklearn.preprocessing.normalize(X_bwd, norm='l2', axis=1)
return _sparse_mm_dot_revcomp(X.data, X.indices, X.indptr, Y.data, Y.indices, Y.indptr, k)
else:
return _sparse_mm_dot(X.data, X.indices, X.indptr, k)
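
At this call site the two branches differ only in whether a reverse-complement representation participates. A toy dense illustration of the dispatch; the per-pair `np.maximum` combination is an assumption about the elided `_sparse_mm_dot_revcomp` body, mirroring the explicit `np.maximum` used in `jaccard_from_seqlets` below:

```python
import numpy as np
import sklearn.preprocessing

rng = np.random.default_rng(0)
X_fwd = sklearn.preprocessing.normalize(rng.random((8, 32)), norm='l2', axis=1)
X_bwd = sklearn.preprocessing.normalize(rng.random((8, 32)), norm='l2', axis=1)

revcomp = True
if revcomp:
    # Strand-aware: similarity to seqlet j is the better of its forward and
    # reverse-complement embeddings (assumed per-pair maximum).
    dot = np.maximum(X_fwd @ X_fwd.T, X_bwd @ X_fwd.T)
else:
    # Forward-only: a single self-similarity matrix suffices.
    dot = X_fwd @ X_fwd.T

k = 3
neighbors = np.argsort(-dot, axis=1, kind='mergesort')[:, :k]
sims = np.take_along_axis(dot, neighbors, axis=1)
```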


def jaccard_from_seqlets(seqlets, min_overlap, filter_seqlets=None,
seqlet_neighbors=None):
seqlet_neighbors=None, revcomp=True):

all_fwd_data, all_rev_data = util.get_2d_data_from_patterns(seqlets)
if revcomp:
all_fwd_data, all_rev_data = util.get_2d_data_from_patterns(seqlets)

if filter_seqlets is None:
filter_seqlets = seqlets
filters_all_fwd_data = all_fwd_data
filters_all_rev_data = all_rev_data
if filter_seqlets is None:
filter_seqlets = seqlets
filters_all_fwd_data = all_fwd_data
filters_all_rev_data = all_rev_data
else:
filters_all_fwd_data, filters_all_rev_data = util.get_2d_data_from_patterns(filter_seqlets)

else:
filters_all_fwd_data, filters_all_rev_data = util.get_2d_data_from_patterns(filter_seqlets)
all_fwd_data = util.get_2d_data_from_patterns(seqlets, revcomp=False)
if filter_seqlets is None:
filter_seqlets = seqlets
filters_all_fwd_data = all_fwd_data
else:
filters_all_fwd_data = util.get_2d_data_from_patterns(filter_seqlets, revcomp=False)

if seqlet_neighbors is None:
seqlet_neighbors = [list(range(len(filter_seqlets)))
@@ -104,12 +134,14 @@ def jaccard_from_seqlets(seqlets, min_overlap, filter_seqlets=None,
Y=all_fwd_data, min_overlap=min_overlap, func=int,
return_sparse=True)

affmat_rev = jaccard(seqlet_neighbors=seqlet_neighbors,
X=filters_all_rev_data, Y=all_fwd_data,
min_overlap=min_overlap, func=int,
return_sparse=True)

affmat = np.maximum(affmat_fwd, affmat_rev)
if revcomp:
affmat_rev = jaccard(seqlet_neighbors=seqlet_neighbors,
X=filters_all_rev_data, Y=all_fwd_data,
min_overlap=min_overlap, func=int,
return_sparse=True)
affmat = np.maximum(affmat_fwd, affmat_rev)
else:
affmat = affmat_fwd
return affmat
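
Here the strand combination is explicit: with `revcomp=True` every pair is scored on both strands and the larger value wins, while `revcomp=False` skips the reverse-complement pass entirely. A self-contained dense sketch, using a signed continuous Jaccard as a stand-in for the elided sparse `jaccard` kernel (an assumption about its exact form):

```python
import numpy as np

def continuous_jaccard(x, y):
    # Signed min/max similarity between two contribution tracks.
    union = np.maximum(np.abs(x), np.abs(y)).sum()
    if union == 0:
        return 0.0
    return (np.minimum(np.abs(x), np.abs(y)) * np.sign(x) * np.sign(y)).sum() / union

rng = np.random.default_rng(0)
fwd = rng.normal(size=(5, 20, 4))   # per-seqlet (length x alphabet) tracks
rev = fwd[:, ::-1, ::-1]            # reverse complement of each track

affmat_fwd = np.array([[continuous_jaccard(a, b) for b in fwd] for a in fwd])
affmat_rev = np.array([[continuous_jaccard(a, b) for b in fwd] for a in rev])

affmat = np.maximum(affmat_fwd, affmat_rev)   # revcomp=True path
affmat_fwd_only = affmat_fwd                  # revcomp=False path
```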


64 changes: 38 additions & 26 deletions modiscolite/aggregator.py
@@ -68,37 +68,47 @@ def _expand_seqlets_to_fill_pattern(pattern, track_set, left_flank_to_add,


def _align_patterns(parent_pattern, child_pattern, metric, min_overlap,
transformer, include_hypothetical):
transformer, include_hypothetical, revcomp):

fwd_data_parent, rev_data_parent = util.get_2d_data_from_patterns(
fwd_data_parent = util.get_2d_data_from_patterns(
[parent_pattern], transformer=transformer,
include_hypothetical=include_hypothetical)
include_hypothetical=include_hypothetical, revcomp=False)

fwd_data_child, rev_data_child = util.get_2d_data_from_patterns(
[child_pattern], transformer=transformer,
include_hypothetical=include_hypothetical)
if revcomp:
fwd_data_child, rev_data_child = util.get_2d_data_from_patterns(
[child_pattern], transformer=transformer,
include_hypothetical=include_hypothetical, revcomp=True)

best_crossmetric, best_crossmetric_argmax = metric(fwd_data_child,
fwd_data_parent, min_overlap).squeeze()

best_crossmetric_rev, best_crossmetric_argmax_rev = metric(rev_data_child,
fwd_data_parent, min_overlap).squeeze()
best_crossmetric, best_crossmetric_argmax = metric(fwd_data_child,
fwd_data_parent, min_overlap).squeeze()
best_crossmetric_rev, best_crossmetric_argmax_rev = metric(rev_data_child,
fwd_data_parent, min_overlap).squeeze()

if best_crossmetric_rev > best_crossmetric:
return int(best_crossmetric_argmax_rev), True, best_crossmetric_rev
if best_crossmetric_rev > best_crossmetric:
return int(best_crossmetric_argmax_rev), True, best_crossmetric_rev
else:
return int(best_crossmetric_argmax), False, best_crossmetric
else:
fwd_data_child = util.get_2d_data_from_patterns(
[child_pattern], transformer=transformer,
include_hypothetical=include_hypothetical, revcomp=False)

best_crossmetric, best_crossmetric_argmax = metric(fwd_data_child,
fwd_data_parent, min_overlap).squeeze()

return int(best_crossmetric_argmax), False, best_crossmetric
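
Condensed, the new control flow in `_align_patterns` is: score the child against the parent on the forward strand, optionally also on the reverse complement, and return the winning offset plus a flag telling the caller whether to flip the seqlet. A runnable toy version, with 1-D tracks and a plain dot-product scorer standing in for the real 2-D data and `metric`:

```python
import numpy as np

def best_alignment(child, parent):
    # Best offset at which the shorter child can be slid along the parent,
    # scored by a plain dot product (toy stand-in for the real metric).
    scores = [float(np.dot(child, parent[off:off + len(child)]))
              for off in range(len(parent) - len(child) + 1)]
    return max(scores), int(np.argmax(scores))

def align(parent, child_fwd, child_rev, revcomp=True):
    score_fwd, off_fwd = best_alignment(child_fwd, parent)
    if revcomp:
        score_rev, off_rev = best_alignment(child_rev, parent)
        if score_rev > score_fwd:
            return off_rev, True, score_rev   # caller should revcomp the child
    return off_fwd, False, score_fwd

parent = np.array([0., 0., 1., 5., 1., 0., 0., 0.])
child = np.array([0., 1., 5., 1., 0.])
print(align(parent, child, child[::-1]))  # offset 1, forward strand
```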


def merge_in_seqlets_filledges(parent_pattern, seqlets_to_merge,
track_set, metric, min_overlap, transformer='l1',
include_hypothetical=True):
include_hypothetical=True, revcomp=True):

parent_pattern = parent_pattern.copy()

for seqlet in seqlets_to_merge:
alnmt, revcomp_match, alnmt_score = _align_patterns(parent_pattern,
seqlet, metric, min_overlap, transformer, include_hypothetical)
seqlet, metric, min_overlap, transformer, include_hypothetical, revcomp)

if revcomp_match:
seqlet = seqlet.revcomp()
@@ -173,7 +183,7 @@ def _detect_spurious_merging(patterns, track_set, perplexity,
min_in_subcluster, min_overlap, prob_and_pertrack_sim_merge_thresholds,
prob_and_pertrack_sim_dealbreaker_thresholds,
min_frac, min_num, flank_to_add, window_size, bg_freq,
n_seeds, max_seqlets_subsample=1000):
n_seeds, revcomp, max_seqlets_subsample=1000):

to_return = []
for i, pattern in enumerate(patterns):
@@ -186,7 +196,7 @@
prob_and_pertrack_sim_merge_thresholds=prob_and_pertrack_sim_merge_thresholds,
prob_and_pertrack_sim_dealbreaker_thresholds=prob_and_pertrack_sim_dealbreaker_thresholds,
min_frac=min_frac, min_num=min_num, flank_to_add=flank_to_add, window_size=window_size,
bg_freq=bg_freq, max_seqlets_subsample=1000)
bg_freq=bg_freq, max_seqlets_subsample=max_seqlets_subsample, revcomp=revcomp)

to_return.extend(refined_subpatterns[0])
else:
@@ -197,13 +207,13 @@
prob_and_pertrack_sim_merge_thresholds=prob_and_pertrack_sim_merge_thresholds,
prob_and_pertrack_sim_dealbreaker_thresholds=prob_and_pertrack_sim_dealbreaker_thresholds,
min_frac=min_frac, min_num=min_num, flank_to_add=flank_to_add, window_size=window_size,
bg_freq=bg_freq, max_seqlets_subsample=1000)
bg_freq=bg_freq, max_seqlets_subsample=max_seqlets_subsample, revcomp=revcomp)

def SimilarPatternsCollapser(patterns, track_set,
min_overlap, prob_and_pertrack_sim_merge_thresholds,
prob_and_pertrack_sim_dealbreaker_thresholds,
min_frac, min_num, flank_to_add, window_size, bg_freq,
max_seqlets_subsample=1000):
revcomp, max_seqlets_subsample=1000):
patterns = [x.copy() for x in patterns]

merge_hierarchy_levels = []
@@ -256,7 +266,8 @@ def SimilarPatternsCollapser(patterns, track_set,
metric=affinitymat.pearson_correlation,
min_overlap=min_overlap,
include_hypothetical=False,
transformer='magnitude')
transformer='magnitude',
revcomp=revcomp)

pairwise_sims[i, j] = aligner_sim

@@ -281,11 +292,11 @@
pattern2_shifted_seqlets = track_set.create_seqlets(
seqlets=pattern2_coords)

pattern1_fwdseqdata, _ =\
util.get_2d_data_from_patterns(subsample_patterns[i].seqlets)
pattern1_fwdseqdata =\
util.get_2d_data_from_patterns(subsample_patterns[i].seqlets, revcomp=False)

pattern2_fwdseqdata, _ =\
util.get_2d_data_from_patterns(pattern2_shifted_seqlets)
pattern2_fwdseqdata =\
util.get_2d_data_from_patterns(pattern2_shifted_seqlets, revcomp=False)

#Flatten, compute continjacc sim at this alignment
flat_pattern1_fwdseqdata = pattern1_fwdseqdata.reshape(
@@ -384,7 +395,8 @@ def SimilarPatternsCollapser(patterns, track_set,
metric=affinitymat.pearson_correlation,
min_overlap=min_overlap,
transformer='magnitude',
track_set=track_set)
track_set=track_set,
revcomp=revcomp)

new_pattern = polish_pattern(new_pattern, min_frac=min_frac,
min_num=min_num, track_set=track_set, flank=flank_to_add,
@@ -454,7 +466,7 @@ def SimilarPatternsCollapser(patterns, track_set,
old_pattern_node_found = True
if (old_pattern_node_found==False):
next_level_nodes.append(
PatternMergeHierarchyNode(frontier_pattern))
PatternMergeHierarchyNode(frontier_pattern))

for next_level_node in next_level_nodes:
#iterate over all the old patterns and their new parent
27 changes: 15 additions & 12 deletions modiscolite/tfmodisco.py
@@ -88,7 +88,7 @@ def _filter_patterns(patterns, min_seqlet_support, window_size,

def _patterns_from_clusters(seqlets, track_set, min_overlap,
min_frac, min_num, flank_to_add, window_size, bg_freq, cluster_indices,
track_sign):
track_sign, revcomp):

seqlet_sort_metric = lambda x: -np.sum(np.abs(x.contrib_scores))
num_clusters = max(cluster_indices+1)
@@ -106,7 +106,7 @@ def _patterns_from_clusters(seqlets, track_set, min_overlap,
pattern = aggregator.merge_in_seqlets_filledges(
parent_pattern=pattern, seqlets_to_merge=sorted_seqlets[1:],
track_set=track_set, metric=affinitymat.jaccard,
min_overlap=min_overlap)
min_overlap=min_overlap, revcomp=revcomp)

pattern = aggregator.polish_pattern(pattern, min_frac=min_frac,
min_num=min_num, track_set=track_set, flank=flank_to_add,
@@ -160,9 +160,9 @@ def seqlets_to_patterns(seqlets, track_set, track_signs=None,
min_num_to_trim_to=30, trim_to_window_size=20, initial_flank_to_add=5,
prob_and_pertrack_sim_merge_thresholds=[(0.8,0.8), (0.5, 0.85), (0.2, 0.9)],
prob_and_pertrack_sim_dealbreaker_thresholds=[(0.4, 0.75), (0.2,0.8), (0.1, 0.85), (0.0,0.9)],
subcluster_perplexity=50, merging_max_seqlets_subsample=300,
subcluster_perplexity=50, merging_max_seqlets_subsample=1000,
final_min_cluster_size=20,min_ic_in_window=0.6, min_ic_windowsize=6,
ppm_pseudocount=0.001):
ppm_pseudocount=0.001, revcomp=True):

bg_freq = np.mean([seqlet.sequence for seqlet in seqlets], axis=(0, 1))

@@ -177,12 +177,12 @@

# Step 1: Generate coarse resolution
coarse_affmat_nn, seqlet_neighbors = affinitymat.cosine_similarity_from_seqlets(
seqlets=seqlets, n_neighbors=nearest_neighbors_to_compute, sign=track_signs)
seqlets=seqlets, n_neighbors=nearest_neighbors_to_compute, sign=track_signs, revcomp=revcomp)

# Step 2: Generate fine representation
fine_affmat_nn = affinitymat.jaccard_from_seqlets(
seqlets=seqlets, seqlet_neighbors=seqlet_neighbors,
min_overlap=min_overlap_while_sliding)
min_overlap=min_overlap_while_sliding, revcomp=revcomp)

if round_idx == 0:
filtered_seqlets, seqlet_neighbors, filtered_affmat_nn = (
Expand Down Expand Up @@ -221,7 +221,8 @@ def seqlets_to_patterns(seqlets, track_set, track_signs=None,
window_size=trim_to_window_size,
bg_freq=bg_freq,
cluster_indices=cluster_indices,
track_sign=track_signs)
track_sign=track_signs,
revcomp=revcomp)

#obtain unique seqlets from adjusted motifs
seqlets = list(dict([(y.string, y)
@@ -239,7 +240,7 @@
flank_to_add=initial_flank_to_add,
window_size=trim_to_window_size, bg_freq=bg_freq,
max_seqlets_subsample=merging_max_seqlets_subsample,
n_seeds=n_leiden_runs)
n_seeds=n_leiden_runs, revcomp=revcomp)

#Now start merging patterns
merged_patterns = sorted(merged_patterns, key=lambda x: -len(x.seqlets))
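
Steps 1 and 2 above are the two affinity computations that `revcomp` now reaches: a cheap coarse similarity proposes `nearest_neighbors_to_compute` candidates per seqlet, and the expensive fine similarity is evaluated only on those pairs. A toy dense sketch of that two-stage structure (illustrative names; the real code uses the gapped-kmer cosine and Jaccard functions from affinitymat.py):

```python
import numpy as np

rng = np.random.default_rng(0)
coarse = rng.random((20, 64))            # cheap per-seqlet embedding

k = 5
coarse_sims = coarse @ coarse.T          # step 1: coarse all-pairs similarity
neighbors = np.argsort(-coarse_sims, axis=1)[:, :k]

def fine_similarity(i, j):
    # Stand-in for the continuous-Jaccard kernel used in step 2.
    return float(np.minimum(coarse[i], coarse[j]).sum()
                 / np.maximum(coarse[i], coarse[j]).sum())

# Step 2 only touches the k proposed neighbors of each seqlet.
fine_affmat_nn = np.array([[fine_similarity(i, j) for j in row]
                           for i, row in enumerate(neighbors)])
print(fine_affmat_nn.shape)              # (20, 5)
```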
Expand Down Expand Up @@ -268,9 +269,9 @@ def TFMoDISco(one_hot, hypothetical_contribs, sliding_window_size=21,
initial_flank_to_add=5,
prob_and_pertrack_sim_merge_thresholds=[(0.8,0.8), (0.5, 0.85), (0.2, 0.9)],
prob_and_pertrack_sim_dealbreaker_thresholds=[(0.4, 0.75), (0.2,0.8), (0.1, 0.85), (0.0,0.9)],
subcluster_perplexity=50, merging_max_seqlets_subsample=300,
subcluster_perplexity=50, merging_max_seqlets_subsample=1000,
final_min_cluster_size=20, min_ic_in_window=0.6, min_ic_windowsize=6,
ppm_pseudocount=0.001, verbose=False):
ppm_pseudocount=0.001, revcomp=True, verbose=False):

contrib_scores = np.multiply(one_hot, hypothetical_contribs)

@@ -327,7 +328,8 @@ def TFMoDISco(one_hot, hypothetical_contribs, sliding_window_size=21,
final_min_cluster_size=final_min_cluster_size,
min_ic_in_window=min_ic_in_window,
min_ic_windowsize=min_ic_windowsize,
ppm_pseudocount=ppm_pseudocount)
ppm_pseudocount=ppm_pseudocount,
revcomp=revcomp)
else:
pos_patterns = None

Expand Down Expand Up @@ -356,7 +358,8 @@ def TFMoDISco(one_hot, hypothetical_contribs, sliding_window_size=21,
final_min_cluster_size=final_min_cluster_size,
min_ic_in_window=min_ic_in_window,
min_ic_windowsize=min_ic_windowsize,
ppm_pseudocount=ppm_pseudocount)
ppm_pseudocount=ppm_pseudocount,
revcomp=revcomp)
else:
neg_patterns = None

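With the plumbing above in place, strand handling is controlled from the public entry point. A hedged usage sketch with random toy inputs (nothing meaningful will be discovered, and the seqlet caller may return no patterns at all; shapes follow the one-hot / hypothetical-contribs convention):

```python
import numpy as np
from modiscolite.tfmodisco import TFMoDISco

rng = np.random.default_rng(0)
one_hot = np.eye(4, dtype='float32')[rng.integers(0, 4, size=(50, 400))]  # (N, L, 4)
hypothetical_contribs = rng.normal(size=(50, 400, 4)).astype('float32')

# revcomp=False keeps the whole pipeline (gapped-kmer cosine, Jaccard
# affinities, alignment, and pattern merging) on the forward strand,
# e.g. for strand-specific assays or non-DNA alphabets.
pos_patterns, neg_patterns = TFMoDISco(
    one_hot=one_hot,
    hypothetical_contribs=hypothetical_contribs,
    revcomp=False)
```
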
17 changes: 11 additions & 6 deletions modiscolite/util.py
@@ -116,24 +116,29 @@ def l1(X):
return X
return (X/abs_sum)

def get_2d_data_from_patterns(patterns, transformer='l1', include_hypothetical=True):
def get_2d_data_from_patterns(patterns, transformer='l1', include_hypothetical=True, revcomp=True):
func = l1 if transformer == 'l1' else magnitude
tracks = ['hypothetical_contribs', 'contrib_scores']
if not include_hypothetical:
tracks = tracks[1:]

all_fwd_data, all_rev_data = [], []
all_fwd_data = []
all_rev_data = []

for pattern in patterns:
snippets = [getattr(pattern, track) for track in tracks]

fwd_data = np.concatenate([func(snippet) for snippet in snippets], axis=1)
rev_data = np.concatenate([func(snippet[::-1, ::-1]) for snippet in snippets], axis=1)

all_fwd_data.append(fwd_data)
all_rev_data.append(rev_data)

return np.array(all_fwd_data), np.array(all_rev_data)
if revcomp:
rev_data = np.concatenate([func(snippet[::-1, ::-1]) for snippet in snippets], axis=1)
all_rev_data.append(rev_data)

if revcomp:
return np.array(all_fwd_data), np.array(all_rev_data)
else:
return np.array(all_fwd_data)


def calculate_window_offsets(center: int, window_size: int) -> tuple:
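
One caveat worth flagging: the return type of `get_2d_data_from_patterns` now depends on the flag, a `(fwd, rev)` tuple when `revcomp=True` (the default, preserving the old behavior) and a single forward array otherwise, which is why every call site in this PR was updated. A minimal sketch of the contract, using a hypothetical stand-in object since the function only touches the two track attributes it reads via `getattr`:

```python
import numpy as np
from types import SimpleNamespace
from modiscolite.util import get_2d_data_from_patterns

rng = np.random.default_rng(0)
pattern = SimpleNamespace(                      # hypothetical stand-in object
    hypothetical_contribs=rng.normal(size=(12, 4)),
    contrib_scores=rng.normal(size=(12, 4)))

fwd, rev = get_2d_data_from_patterns([pattern])                  # default: tuple
fwd_only = get_2d_data_from_patterns([pattern], revcomp=False)   # single array

assert fwd.shape == rev.shape == fwd_only.shape == (1, 12, 8)    # 2 tracks x 4 columns
assert np.allclose(fwd, fwd_only)
```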