Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from .beta import archr_model21
from .beta import beta
from .beta import marge
from .maestro import rp_enhanced
from .maestro import rp_simple
31 changes: 23 additions & 8 deletions openproblems/tasks/regulatory_effect_prediction/methods/beta.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from ....patch import patch_datacache
from ....tools.decorators import method
from .maestro import _rp_enhanced
from .maestro import _rp_simple

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -28,6 +30,7 @@ def _chrom_limit(x, tss_size=2e5):
return [gene_end - tss_size // 2, gene_end + tss_size // 2]


# included 2-3 lines to make it run with exons.
def _get_annotation(adata, retries=3):
"""Insert meta data into adata.obs."""
from pyensembl import EnsemblRelease
Expand Down Expand Up @@ -55,12 +58,7 @@ def _get_annotation(adata, retries=3):
)
gene = data.gene_by_id(i)
genes.append(
[
"chr%s" % gene.contig,
gene.start,
gene.end,
gene.strand,
]
["chr%s" % gene.contig, gene.start, gene.end, gene.strand, gene.exons]
)
except ValueError:
try:
Expand All @@ -76,6 +74,7 @@ def _get_annotation(adata, retries=3):
gene.start,
gene.end,
gene.strand,
gene.exons,
]
)
except (IndexError, ValueError) as e:
Expand All @@ -86,7 +85,7 @@ def _get_annotation(adata, retries=3):
[adata.var, pd.DataFrame(genes, index=adata.var_names)], axis=1
)
adata.var.columns = np.hstack(
[old_col, np.array(["chr", "start", "end", "strand"])]
[old_col, np.array(["chr", "start", "end", "strand", "exons"])]
)


Expand Down Expand Up @@ -135,7 +134,7 @@ def _filter_has_chr(adata):
return adata


def _atac_genes_score(adata, top_genes=2000, threshold=1, method="beta"):
def _atac_genes_score(adata, top_genes=2000, threshold=1, method="beta", **kwargs):
"""Calculate gene scores and insert into .obsm."""
import pybedtools

Expand Down Expand Up @@ -191,6 +190,8 @@ def _atac_genes_score(adata, top_genes=2000, threshold=1, method="beta"):
axis=1,
)

extend_tss["gene_short_name"] = adata.var["gene_short_name"]

# peak summits
peaks = pd.DataFrame(
{
Expand All @@ -211,8 +212,10 @@ def _atac_genes_score(adata, top_genes=2000, threshold=1, method="beta"):
)

# overlap TSS bins with peaks

x = pybedtools.BedTool.from_dataframe(summits)
y = pybedtools.BedTool.from_dataframe(extend_tss)

tss_to_peaks = x.intersect(y, wb=True, wa=True, loj=True).to_dataframe()

# remove non-overlapped TSS and peaks
Expand All @@ -226,6 +229,18 @@ def _atac_genes_score(adata, top_genes=2000, threshold=1, method="beta"):
_archr_model21(tss_to_peaks, adata)
elif method == "marge":
_marge(tss_to_peaks, adata)
elif method == "rp_simple":
_rp_simple(tss_to_peaks, adata)
elif method == "rp_enhanced":
_rp_enhanced(tss_to_peaks, adata)

# the genes and gene_scores have to have the same dimensions
same_shape = adata.shape == adata.obsm["gene_score"].shape
if not same_shape:
print(adata.shape, adata.obsm["gene_score"].shape)
print("dimensions are not the same. Check calculation/filters")
assert same_shape

return adata


Expand Down
200 changes: 200 additions & 0 deletions openproblems/tasks/regulatory_effect_prediction/methods/maestro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
from ....tools.decorators import method

import pandas as pd


def _rp_simple(tss_to_peaks, adata, log_each=500):
import numpy as np
import scipy

# the coordinates of the current peaks
peaks = adata.uns["mode2_var"]

decay = 10000

def Sg(x):
return 2 ** (-x)

# gene_distance = 15 * decay

weights = []

tss_to_peaks = tss_to_peaks.drop_duplicates("itemRgb")

# print(tss_to_peaks.shape)
# print(tss_to_peaks.head())

for ri, r in tss_to_peaks.iterrows():
wi = 0
summit_chr, tss_summit_start, tss_summit_end = r[:3]
tss_extend_chr, tss_extend_start, tss_extend_end = r[4:7]

# print(summit_chr, summit_start, summit_end,
# extend_chr, extend_start, extend_end)
sel_chr = [pi for pi in peaks if pi[0] == tss_extend_chr]
sel_peaks = [
pi
for pi in sel_chr
if int(pi[1]) >= tss_extend_start and int(pi[2]) <= tss_extend_end
]

# print('# peaks in chromosome', len(sel_chr),
# '# of peaks around tss', len(sel_peaks))
# if len(sel_peaks) > 0:
# print(sel_peaks)

# if peaks then this is take them into account, one by one
for pi in sel_peaks:
summit_peak = int((int(pi[2]) + int(pi[1])) / 2)
distance = np.abs(tss_summit_start - summit_peak)
# print(pi, distance, Sg(distance / decay))
wi += Sg(distance / decay)

if log_each is not None and len(weights) % log_each == 0:
if len(weights) > 0:
print(
"# weights calculated so far",
len(weights),
"out of",
tss_to_peaks.shape[0],
)
weights.append(wi)

tss_to_peaks["weight"] = weights

gene_peak_weight = scipy.sparse.csr_matrix(
(
tss_to_peaks.weight.values,
(tss_to_peaks.thickEnd.astype("int32").values, tss_to_peaks.name.values),
),
shape=(adata.shape[1], adata.uns["mode2_var"].shape[0]),
)

adata.obsm["gene_score"] = adata.obsm["mode2"] @ gene_peak_weight.T


def _rp_enhanced(tss_to_peaks, adata, log_each=500):
import numpy as np
import scipy

# prepare the exonic ranges
exon_ranges = []
for exons in adata.var["exons"]:
exon_ranges.append([e.start, e.end] for e in exons)
adata.var["exon.ranges"] = exon_ranges
exon_coordinates_by_gene = adata.var["exon.ranges"].to_dict()

tss_to_peaks["exon.ranges"] = tss_to_peaks["itemRgb"].map(exon_coordinates_by_gene)
tss_to_peaks = tss_to_peaks.drop_duplicates("itemRgb").reset_index(drop=True)

# the coordinates of the current peaks
peaks = adata.uns["mode2_var"]

decay = 10000

def Sg(x):
return 2 ** (-x)

print("calculating weights per gene...")
weights = []
for ri, r in tss_to_peaks.iterrows():
wi = 0
summit_chr, tss_summit_start, tss_summit_end = r[:3]
tss_extend_chr, tss_extend_start, tss_extend_end = r[4:7]

# gene_name = r[-2]
exon_ranges = r[-1]

# print(summit_chr, tss_summit_start, tss_summit_end,
# tss_extend_chr, tss_extend_start, tss_extend_end,
# gene_name, exon_ranges)
sel_chr = [pi for pi in peaks if pi[0] == tss_extend_chr]
sel_peaks = [
pi
for pi in sel_chr
if int(pi[1]) >= tss_extend_start and int(pi[2]) <= tss_extend_end
]
# check whether the peak overlaps with a given exon
if not pd.isnull(exon_ranges):
sel_peak_summits = [(int(pi[1]) + int(pi[2])) / 2.0 for pi in sel_peaks]
peak_in_exons = [
np.sum([ps >= ex[0] and ps <= ex[1] for ex in exon_ranges]) >= 1
for ps in sel_peak_summits
]
else:
peak_in_exons = [False for pi in sel_peaks]

# if sum(peak_in_exons) > 0:
# print ('exon / peak overlap found!')
# print(ri, peak_in_exons)
# if peaks then this is take them into account, one by one
for pi, peak_in_exon in zip(sel_peaks, peak_in_exons):
# the current peak is part of an exon
# if peak_in_exon:
# print(pi)
summit_peak = int((int(pi[2]) + int(pi[1])) / 2)
distance = np.abs(tss_summit_start - summit_peak)
# print(pi, distance, Sg(distance / decay))
wi += Sg(distance / decay) if not peak_in_exon else 1.0

if log_each is not None and len(weights) % log_each == 0:
if len(weights) > 0:
print(
"# weights calculated so far",
len(weights),
"out of",
tss_to_peaks.shape[0],
)

weights.append(wi)

out = tss_to_peaks.copy()
out["weight"] = weights

gene_peak_weight = scipy.sparse.csr_matrix(
(
out.weight.values,
(out.thickEnd.astype("int32").values, out.name.values),
),
shape=(adata.shape[1], adata.uns["mode2_var"].shape[0]),
)

adata.obsm["gene_score"] = adata.obsm["mode2"] @ gene_peak_weight.T


@method(
method_name="RP_simple",
paper_name="""Integrative analyses of single-cell transcriptome\
and regulome using MAESTRO.""",
paper_url="https://pubmed.ncbi.nlm.nih.gov/32767996",
paper_year=2020,
code_version="1.0",
code_url="https://github.com/liulab-dfci/MAESTRO",
image="openproblems-python-extras",
)
def rp_simple(adata, n_top_genes=2000):
from .beta import _atac_genes_score

adata = _atac_genes_score(
adata,
top_genes=n_top_genes,
method="rp_simple",
)
return adata


@method(
method_name="RP_enhanced",
paper_name="""Integrative analyses of single-cell transcriptome\
and regulome using MAESTRO.""",
paper_url="https://pubmed.ncbi.nlm.nih.gov/32767996",
paper_year=2020,
code_version="1.0",
code_url="https://github.com/liulab-dfci/MAESTRO",
image="openproblems-python-extras",
)
def rp_enhanced(adata, n_top_genes=2000):
from .beta import _atac_genes_score

adata = _atac_genes_score(adata, top_genes=n_top_genes, method="rp_enhanced")
return adata
Loading