Skip to content

Commit f0770f6

Browse files
added Paired Insertion Count matrix
1 parent 965db87 commit f0770f6

File tree

1 file changed

+105
-0
lines changed

1 file changed

+105
-0
lines changed

benj/pic.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
2+
def pic_count(fragments, peak_bed, extend:int=5):
3+
import os
4+
import numpy as np
5+
from scipy.sparse import csr_matrix
6+
import pandas as pd
7+
import pyranges
8+
import anndata
9+
df = pd.read_csv(fragments, sep="\t", header=None, comment="#").iloc[:, range(4)]
10+
df.columns = ["seqnames", "start", "end", "barcode"]
11+
bc = pd.Index(pd.unique(df["barcode"]))
12+
lhs_extend = int((extend - 1) // 2 + (extend - 1) % 2)
13+
rhs_extend = int((extend - 1) // 2) + 1
14+
pf = pd.read_csv(peak_bed, sep="\t", header=None).iloc[:, range(3)]
15+
pf.columns = ["seqnames", "start", "end"]
16+
pf["interval"] = ["%s:%d-%d" % (seqnames, start, end) for seqnames, start, end in zip(pf.seqnames, pf.start, pf.end)]
17+
pf.index = pf["interval"].values
18+
peaks = pyranges.from_dict({"Chromosome": pf["seqnames"].values,
19+
"Start": pf["start"].values,
20+
"End": pf["end"].values,
21+
"peak_index": np.arange(pf.shape[0])})
22+
left = pyranges.from_dict({"Chromosome": df["seqnames"].values,
23+
"Start": df["start"].values - lhs_extend,
24+
"End": df["start"].values + rhs_extend,
25+
"frag_index": np.arange(df.shape[0])})
26+
right = pyranges.from_dict({"Chromosome": df["seqnames"].values,
27+
"Start": df["end"].values - lhs_extend - 1, ### End is non-inclusive
28+
"End": df["end"].values + rhs_extend - 1,
29+
"frag_index": np.arange(df.shape[0])})
30+
ovp_s = left.join(peaks, how="left", preserve_order=True).df.loc[:, ["frag_index", "peak_index"]].sort_values(["frag_index", "peak_index"]).drop_duplicates("frag_index")
31+
ovp_e = right.join(peaks, how="left", preserve_order=True).df.loc[:, ["frag_index", "peak_index"]].sort_values(["frag_index", "peak_index"]).drop_duplicates("frag_index")
32+
del left, right, peaks
33+
ovp = pd.DataFrame({"frag_index": np.arange(df.shape[0]), "peak_index_s": -1, "peak_index_e": -1})
34+
ovp.loc[ovp_s["frag_index"].values, "peak_index_s"] = ovp_s["peak_index"].values
35+
ovp.loc[ovp_e["frag_index"].values, "peak_index_e"] = ovp_e["peak_index"].values
36+
ovp["barcode_index"] = bc.get_indexer(df["barcode"].values[ovp["frag_index"].values])
37+
del ovp_s, ovp_e
38+
pic = pd.concat((ovp.loc[ovp["peak_index_s"] != ovp["peak_index_e"], ["barcode_index", "peak_index_e"]].rename({"peak_index_e": "peak_index"}, axis=1),
39+
ovp.loc[:, ["barcode_index", "peak_index_s"]].rename({"peak_index_s": "peak_index"}, axis=1)))
40+
pic = pic.loc[pic["peak_index"] >= 0, :]
41+
S = csr_matrix((np.ones(pic.shape[0], dtype=np.int16), (pic["barcode_index"].values, pic["peak_index"].values)),
42+
shape=(len(bc), pf.shape[0]), dtype=np.int16)
43+
S.sum_duplicates()
44+
uns = {"files": {"fragments": os.path.abspath(fragments)}}
45+
return anndata.AnnData(S, obs=pd.DataFrame(index=bc), var=pf, uns=uns)
46+
47+
def pic_create_h5ad(fragments, peak_bed, sample, h5ad,
48+
tss_bed:str=None,
49+
blacklist_bed:str=None,
50+
compression:int=6,
51+
sample_name="Sample",
52+
extend:int=5,
53+
promoter_upstream:int=2000,
54+
promoter_downstream:int=100):
55+
import pandas as pd
56+
import scanpy as sc
57+
adata = pic_count(fragments=fragments, peak_bed=peak_bed, extend=extend)
58+
qc_vars = []
59+
if blacklist_bed is not None:
60+
import pyranges
61+
gr = pyranges.from_dict({"Chromosome": adata.var["seqnames"].values,
62+
"Start": adata.var["start"].values,
63+
"End": adata.var["end"].values,
64+
"interval": adata.var_names})
65+
bl = pd.read_csv(blacklist_bed, sep="\t", header=None)
66+
bl = pyranges.from_dict({"Chromosome": bl[0].values,
67+
"Start": bl[1].values,
68+
"End": bl[2].values})
69+
bl_peaks = pd.unique(gr.join(bl).df["interval"])
70+
adata.var["non_blacklist"] = ~adata.var_names.isin(bl_peaks)
71+
qc_vars.append("non_blacklist")
72+
if tss_bed is not None:
73+
import pyranges
74+
from muon import atac as ac
75+
from benj.gene_estimation import get_tss
76+
tss = benj.get_tss(tss_bed).rename({"left": "Start", "right": "End"}, axis=1)
77+
tsse = ac.tl.tss_enrichment(adata, tss, n_tss=tss.shape[0])
78+
gr = pyranges.from_dict({"Chromosome": adata.var["seqnames"].values,
79+
"Start": adata.var["start"].values,
80+
"End": adata.var["end"].values,
81+
"interval": adata.var_names})
82+
tr = pyranges.from_dict({"Chromosome": tss["Chromosome"].values,
83+
"Start": tss["Start"].values,
84+
"End": tss["End"].values})
85+
tss_peaks = pd.unique(gr.join(tr).df["interval"])
86+
adata.var["overlap_tss"] = adata.var_names.isin(tss_peaks)
87+
qc_vars.append("overlap_tss")
88+
### Promoters
89+
if "strand" in tss.columns:
90+
tss["PromStart"] = tss["Start"] - promoter_upstream
91+
tss["PromEnd"] = tss["End"] + promoter_downstream
92+
tss.loc[tss["strand"] == "-", "PromStart"] = tss["Start"] - promoter_downstream
93+
tss.loc[tss["strand"] == "-", "PromEnd"] = tss["End"] + promoter_upstream
94+
pr = pyranges.from_dict({"Chromosome": tss["Chromosome"].values,
95+
"Start": tss["PromStart"].values,
96+
"End": tss["PromEnd"].values})
97+
prom_peaks = pd.unique(gr.join(pr).df["interval"])
98+
adata.var["overlap_promoter"] = adata.var_names.isin(prom_peaks)
99+
qc_vars.append("overlap_promoter")
100+
sc.pp.calculate_qc_metrics(adata, qc_vars=qc_vars, inpalce=True, percent_top=None)
101+
if sample is not None:
102+
adata.obs[sample_name] = sample
103+
adata.obs_names = [f"{sample}#{bc}" for bc in adata.obs_names]
104+
adata.write_h5ad(h5ad, compression="gzip", compression_opts=compression)
105+

0 commit comments

Comments
 (0)