qinqian · ilibarra · Mar 30, 2021 · Mar 30, 2021 · Mar 30, 2021 · Mar 30, 2021
diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/__init__.py b/openproblems/tasks/regulatory_effect_prediction/methods/__init__.py
@@ -1,3 +1,5 @@
 from .beta import archr_model21
 from .beta import beta
 from .beta import marge
+from .maestro import rp_enhanced
+from .maestro import rp_simple
diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/beta.py b/openproblems/tasks/regulatory_effect_prediction/methods/beta.py
@@ -1,5 +1,7 @@
 from ....patch import patch_datacache
 from ....tools.decorators import method
+from .maestro import _rp_enhanced
+from .maestro import _rp_simple
 
 import numpy as np
 import pandas as pd
@@ -28,6 +30,7 @@ def _chrom_limit(x, tss_size=2e5):
         return [gene_end - tss_size // 2, gene_end + tss_size // 2]
 
 
+# included 2-3 lines to make it run with exons.
 def _get_annotation(adata, retries=3):
     """Insert meta data into adata.obs."""
     from pyensembl import EnsemblRelease
@@ -55,12 +58,7 @@ def _get_annotation(adata, retries=3):
                 )
                 gene = data.gene_by_id(i)
             genes.append(
-                [
-                    "chr%s" % gene.contig,
-                    gene.start,
-                    gene.end,
-                    gene.strand,
-                ]
+                ["chr%s" % gene.contig, gene.start, gene.end, gene.strand, gene.exons]
             )
         except ValueError:
             try:
@@ -76,6 +74,7 @@ def _get_annotation(adata, retries=3):
                         gene.start,
                         gene.end,
                         gene.strand,
+                        gene.exons,
                     ]
                 )
             except (IndexError, ValueError) as e:
@@ -86,7 +85,7 @@ def _get_annotation(adata, retries=3):
         [adata.var, pd.DataFrame(genes, index=adata.var_names)], axis=1
     )
     adata.var.columns = np.hstack(
-        [old_col, np.array(["chr", "start", "end", "strand"])]
+        [old_col, np.array(["chr", "start", "end", "strand", "exons"])]
     )
 
 
@@ -135,7 +134,7 @@ def _filter_has_chr(adata):
     return adata
 
 
-def _atac_genes_score(adata, top_genes=2000, threshold=1, method="beta"):
+def _atac_genes_score(adata, top_genes=2000, threshold=1, method="beta", **kwargs):
     """Calculate gene scores and insert into .obsm."""
     import pybedtools
 
@@ -191,6 +190,8 @@ def _atac_genes_score(adata, top_genes=2000, threshold=1, method="beta"):
         axis=1,
     )
 
+    extend_tss["gene_short_name"] = adata.var["gene_short_name"]
+
     # peak summits
     peaks = pd.DataFrame(
         {
@@ -211,8 +212,10 @@ def _atac_genes_score(adata, top_genes=2000, threshold=1, method="beta"):
     )
 
     # overlap TSS bins with peaks
+
     x = pybedtools.BedTool.from_dataframe(summits)
     y = pybedtools.BedTool.from_dataframe(extend_tss)
+
     tss_to_peaks = x.intersect(y, wb=True, wa=True, loj=True).to_dataframe()
 
     # remove non-overlapped TSS and peaks
@@ -226,6 +229,18 @@ def _atac_genes_score(adata, top_genes=2000, threshold=1, method="beta"):
         _archr_model21(tss_to_peaks, adata)
     elif method == "marge":
         _marge(tss_to_peaks, adata)
+    elif method == "rp_simple":
+        _rp_simple(tss_to_peaks, adata)
+    elif method == "rp_enhanced":
+        _rp_enhanced(tss_to_peaks, adata)
+
+    # the genes and gene_scores have to have the same dimensions
+    same_shape = adata.shape == adata.obsm["gene_score"].shape
+    if not same_shape:
+        print(adata.shape, adata.obsm["gene_score"].shape)
+        print("dimensions are not the same. Check calculation/filters")
+        assert same_shape
+
     return adata
 
 

diff --git a/openproblems/tasks/regulatory_effect_prediction/methods/maestro.py b/openproblems/tasks/regulatory_effect_prediction/methods/maestro.py
@@ -0,0 +1,200 @@
+from ....tools.decorators import method
+
+import pandas as pd
+
+
+def _rp_simple(tss_to_peaks, adata, log_each=500):
+    import numpy as np
+    import scipy
+
+    # the coordinates of the current peaks
+    peaks = adata.uns["mode2_var"]
+
+    decay = 10000
+
+    def Sg(x):
+        return 2 ** (-x)
+
+    # gene_distance = 15 * decay
+
+    weights = []
+
+    tss_to_peaks = tss_to_peaks.drop_duplicates("itemRgb")
+
+    # print(tss_to_peaks.shape)
+    # print(tss_to_peaks.head())
+
+    for ri, r in tss_to_peaks.iterrows():
+        wi = 0
+        summit_chr, tss_summit_start, tss_summit_end = r[:3]
+        tss_extend_chr, tss_extend_start, tss_extend_end = r[4:7]
+
+        # print(summit_chr, summit_start, summit_end,
+        #       extend_chr, extend_start, extend_end)
+        sel_chr = [pi for pi in peaks if pi[0] == tss_extend_chr]
+        sel_peaks = [
+            pi
+            for pi in sel_chr
+            if int(pi[1]) >= tss_extend_start and int(pi[2]) <= tss_extend_end
+        ]
+
+        # print('# peaks in chromosome', len(sel_chr),
+        #       '# of peaks around tss', len(sel_peaks))
+        # if len(sel_peaks) > 0:
+        # print(sel_peaks)
+
+        # if peaks then this is take them into account, one by one
+        for pi in sel_peaks:
+            summit_peak = int((int(pi[2]) + int(pi[1])) / 2)
+            distance = np.abs(tss_summit_start - summit_peak)
+            # print(pi, distance, Sg(distance / decay))
+            wi += Sg(distance / decay)
+
+        if log_each is not None and len(weights) % log_each == 0:
+            if len(weights) > 0:
+                print(
+                    "# weights calculated so far",
+                    len(weights),
+                    "out of",
+                    tss_to_peaks.shape[0],
+                )
+        weights.append(wi)
+
+    tss_to_peaks["weight"] = weights
+
+    gene_peak_weight = scipy.sparse.csr_matrix(
+        (
+            tss_to_peaks.weight.values,
+            (tss_to_peaks.thickEnd.astype("int32").values, tss_to_peaks.name.values),
+        ),
+        shape=(adata.shape[1], adata.uns["mode2_var"].shape[0]),
+    )
+
+    adata.obsm["gene_score"] = adata.obsm["mode2"] @ gene_peak_weight.T
+
+
+def _rp_enhanced(tss_to_peaks, adata, log_each=500):
+    import numpy as np
+    import scipy
+
+    # prepare the exonic ranges
+    exon_ranges = []
+    for exons in adata.var["exons"]:
+        exon_ranges.append([e.start, e.end] for e in exons)
+    adata.var["exon.ranges"] = exon_ranges
+    exon_coordinates_by_gene = adata.var["exon.ranges"].to_dict()
+
+    tss_to_peaks["exon.ranges"] = tss_to_peaks["itemRgb"].map(exon_coordinates_by_gene)
+    tss_to_peaks = tss_to_peaks.drop_duplicates("itemRgb").reset_index(drop=True)
+
+    # the coordinates of the current peaks
+    peaks = adata.uns["mode2_var"]
+
+    decay = 10000
+
+    def Sg(x):
+        return 2 ** (-x)
+
+    print("calculating weights per gene...")
+    weights = []
+    for ri, r in tss_to_peaks.iterrows():
+        wi = 0
+        summit_chr, tss_summit_start, tss_summit_end = r[:3]
+        tss_extend_chr, tss_extend_start, tss_extend_end = r[4:7]
+
+        # gene_name = r[-2]
+        exon_ranges = r[-1]
+
+        # print(summit_chr, tss_summit_start, tss_summit_end,
+        #       tss_extend_chr, tss_extend_start, tss_extend_end,
+        #      gene_name, exon_ranges)
+        sel_chr = [pi for pi in peaks if pi[0] == tss_extend_chr]
+        sel_peaks = [
+            pi
+            for pi in sel_chr
+            if int(pi[1]) >= tss_extend_start and int(pi[2]) <= tss_extend_end
+        ]
+        # check whether the peak overlaps with a given exon
+        if not pd.isnull(exon_ranges):
+            sel_peak_summits = [(int(pi[1]) + int(pi[2])) / 2.0 for pi in sel_peaks]
+            peak_in_exons = [
+                np.sum([ps >= ex[0] and ps <= ex[1] for ex in exon_ranges]) >= 1
+                for ps in sel_peak_summits
+            ]
+        else:
+            peak_in_exons = [False for pi in sel_peaks]
+
+        # if sum(peak_in_exons) > 0:
+        #     print ('exon / peak overlap found!')
+        #     print(ri, peak_in_exons)
+        # if peaks then this is take them into account, one by one
+        for pi, peak_in_exon in zip(sel_peaks, peak_in_exons):
+            # the current peak is part of an exon
+            # if peak_in_exon:
+            #     print(pi)
+            summit_peak = int((int(pi[2]) + int(pi[1])) / 2)
+            distance = np.abs(tss_summit_start - summit_peak)
+            # print(pi, distance, Sg(distance / decay))
+            wi += Sg(distance / decay) if not peak_in_exon else 1.0
+
+        if log_each is not None and len(weights) % log_each == 0:
+            if len(weights) > 0:
+                print(
+                    "# weights calculated so far",
+                    len(weights),
+                    "out of",
+                    tss_to_peaks.shape[0],
+                )
+
+        weights.append(wi)
+
+    out = tss_to_peaks.copy()
+    out["weight"] = weights
+
+    gene_peak_weight = scipy.sparse.csr_matrix(
+        (
+            out.weight.values,
+            (out.thickEnd.astype("int32").values, out.name.values),
+        ),
+        shape=(adata.shape[1], adata.uns["mode2_var"].shape[0]),
+    )
+
+    adata.obsm["gene_score"] = adata.obsm["mode2"] @ gene_peak_weight.T
+
+
+@method(
+    method_name="RP_simple",
+    paper_name="""Integrative analyses of single-cell transcriptome\
+and regulome using MAESTRO.""",
+    paper_url="https://pubmed.ncbi.nlm.nih.gov/32767996",
+    paper_year=2020,
+    code_version="1.0",
+    code_url="https://github.com/liulab-dfci/MAESTRO",
+    image="openproblems-python-extras",
+)
+def rp_simple(adata, n_top_genes=2000):
+    from .beta import _atac_genes_score
+
+    adata = _atac_genes_score(
+        adata,
+        top_genes=n_top_genes,
+        method="rp_simple",
+    )
+    return adata
+
+
+@method(
+    method_name="RP_enhanced",
+    paper_name="""Integrative analyses of single-cell transcriptome\
+and regulome using MAESTRO.""",
+    paper_url="https://pubmed.ncbi.nlm.nih.gov/32767996",
+    paper_year=2020,
+    code_version="1.0",
+    code_url="https://github.com/liulab-dfci/MAESTRO",
+    image="openproblems-python-extras",
+)
+def rp_enhanced(adata, n_top_genes=2000):
+    from .beta import _atac_genes_score
+
+    adata = _atac_genes_score(adata, top_genes=n_top_genes, method="rp_enhanced")
+    return adata