|
| 1 | + |
def pic_count(fragments, peak_bed, extend:int=5):
    """Build a paired-insertion count (PIC) matrix of barcodes x peaks.

    Each fragment contributes its two insertion sites (the start, and the
    end shifted by -1 because BED ends are non-inclusive). Each insertion
    is widened to a window of ``extend`` bp and intersected with the peaks.
    If both insertions of a fragment fall in the same peak the fragment is
    counted once for that peak; otherwise each insertion is counted in its
    own peak (see the concat below).

    :param fragments: path to a tab-separated fragments file; first four
        columns are chrom, start, end, barcode; '#' lines are skipped
    :param peak_bed: path to a BED file of peaks (chrom, start, end)
    :param extend: total window width (bp) placed around each insertion
    :return: anndata.AnnData with an int16 CSR matrix of shape
        (n_barcodes, n_peaks); obs indexed by barcode, var indexed by
        "chrom:start-end" interval strings
    """
    import os
    import numpy as np
    from scipy.sparse import csr_matrix
    import pandas as pd
    import pyranges
    import anndata
    df = pd.read_csv(fragments, sep="\t", header=None, comment="#").iloc[:, range(4)]
    df.columns = ["seqnames", "start", "end", "barcode"]
    # Barcode universe in first-seen order; row index of the output matrix.
    bc = pd.Index(pd.unique(df["barcode"]))
    # Split the (extend-1) bp padding around the insertion point; the extra
    # remainder bp goes to the left, and +1 on the right makes the pyranges
    # End exclusive, so the window covers `extend` bases total.
    lhs_extend = int((extend - 1) // 2 + (extend - 1) % 2)
    rhs_extend = int((extend - 1) // 2) + 1
    pf = pd.read_csv(peak_bed, sep="\t", header=None).iloc[:, range(3)]
    pf.columns = ["seqnames", "start", "end"]
    pf["interval"] = ["%s:%d-%d" % (seqnames, start, end) for seqnames, start, end in zip(pf.seqnames, pf.start, pf.end)]
    pf.index = pf["interval"].values
    peaks = pyranges.from_dict({"Chromosome": pf["seqnames"].values,
                                "Start": pf["start"].values,
                                "End": pf["end"].values,
                                "peak_index": np.arange(pf.shape[0])})
    # Window around the 5' insertion (fragment start).
    left = pyranges.from_dict({"Chromosome": df["seqnames"].values,
                               "Start": df["start"].values - lhs_extend,
                               "End": df["start"].values + rhs_extend,
                               "frag_index": np.arange(df.shape[0])})
    # Window around the 3' insertion; shifted by -1 because the fragment
    # End coordinate is non-inclusive (last inserted base is end-1).
    right = pyranges.from_dict({"Chromosome": df["seqnames"].values,
                                "Start": df["end"].values - lhs_extend - 1, ### End is non-inclusive
                                "End": df["end"].values + rhs_extend - 1,
                                "frag_index": np.arange(df.shape[0])})
    # Left join keeps non-overlapping fragments (peak_index == -1). A window
    # can hit several peaks; sort + drop_duplicates deterministically keeps
    # the lowest peak_index per fragment.
    ovp_s = left.join(peaks, how="left", preserve_order=True).df.loc[:, ["frag_index", "peak_index"]].sort_values(["frag_index", "peak_index"]).drop_duplicates("frag_index")
    ovp_e = right.join(peaks, how="left", preserve_order=True).df.loc[:, ["frag_index", "peak_index"]].sort_values(["frag_index", "peak_index"]).drop_duplicates("frag_index")
    del left, right, peaks
    # One row per fragment; -1 marks "no peak" for either insertion.
    ovp = pd.DataFrame({"frag_index": np.arange(df.shape[0]), "peak_index_s": -1, "peak_index_e": -1})
    ovp.loc[ovp_s["frag_index"].values, "peak_index_s"] = ovp_s["peak_index"].values
    ovp.loc[ovp_e["frag_index"].values, "peak_index_e"] = ovp_e["peak_index"].values
    ovp["barcode_index"] = bc.get_indexer(df["barcode"].values[ovp["frag_index"].values])
    del ovp_s, ovp_e
    # PIC rule: always count the start insertion; count the end insertion
    # only when it landed in a DIFFERENT peak, so a fragment fully inside
    # one peak contributes exactly 1.
    pic = pd.concat((ovp.loc[ovp["peak_index_s"] != ovp["peak_index_e"], ["barcode_index", "peak_index_e"]].rename({"peak_index_e": "peak_index"}, axis=1),
                     ovp.loc[:, ["barcode_index", "peak_index_s"]].rename({"peak_index_s": "peak_index"}, axis=1)))
    # Drop insertions that hit no peak.
    pic = pic.loc[pic["peak_index"] >= 0, :]
    # Duplicate (barcode, peak) triplets are summed into counts.
    S = csr_matrix((np.ones(pic.shape[0], dtype=np.int16), (pic["barcode_index"].values, pic["peak_index"].values)),
                   shape=(len(bc), pf.shape[0]), dtype=np.int16)
    S.sum_duplicates()
    uns = {"files": {"fragments": os.path.abspath(fragments)}}
    return anndata.AnnData(S, obs=pd.DataFrame(index=bc), var=pf, uns=uns)
| 46 | + |
def pic_create_h5ad(fragments, peak_bed, sample, h5ad,
                    tss_bed:str=None,
                    blacklist_bed:str=None,
                    compression:int=6,
                    sample_name="Sample",
                    extend:int=5,
                    promoter_upstream:int=2000,
                    promoter_downstream:int=100):
    """Count fragments in peaks (PIC), annotate peak/cell QC, and write .h5ad.

    Runs :func:`pic_count`, optionally flags blacklist/TSS/promoter peaks as
    QC variables, computes scanpy QC metrics, prefixes barcodes with the
    sample name, and writes the result with gzip compression.

    :param fragments: fragments file passed through to ``pic_count``
    :param peak_bed: peak BED file passed through to ``pic_count``
    :param sample: sample identifier; if not None, stored in obs and
        prepended to barcodes as "{sample}#{barcode}"
    :param h5ad: output path for the AnnData file
    :param tss_bed: optional TSS BED; enables TSS enrichment and
        overlap_tss / overlap_promoter peak flags
    :param blacklist_bed: optional blacklist BED; enables the non_blacklist flag
    :param compression: gzip compression level for the output file
    :param sample_name: obs column name used to store ``sample``
    :param extend: insertion window size forwarded to ``pic_count``
    :param promoter_upstream: bp upstream of the TSS included in the promoter
    :param promoter_downstream: bp downstream of the TSS included in the promoter
    """
    import pandas as pd
    import scanpy as sc
    adata = pic_count(fragments=fragments, peak_bed=peak_bed, extend=extend)
    qc_vars = []
    if blacklist_bed is not None:
        import pyranges
        gr = pyranges.from_dict({"Chromosome": adata.var["seqnames"].values,
                                 "Start": adata.var["start"].values,
                                 "End": adata.var["end"].values,
                                 "interval": adata.var_names})
        bl = pd.read_csv(blacklist_bed, sep="\t", header=None)
        bl = pyranges.from_dict({"Chromosome": bl[0].values,
                                 "Start": bl[1].values,
                                 "End": bl[2].values})
        bl_peaks = pd.unique(gr.join(bl).df["interval"])
        adata.var["non_blacklist"] = ~adata.var_names.isin(bl_peaks)
        qc_vars.append("non_blacklist")
    if tss_bed is not None:
        import pyranges
        from muon import atac as ac
        from benj.gene_estimation import get_tss
        # FIX: was `benj.get_tss(...)`, but only `get_tss` is imported above,
        # so the name `benj` was undefined and this branch always raised
        # NameError.
        tss = get_tss(tss_bed).rename({"left": "Start", "right": "End"}, axis=1)
        # Per-barcode TSS enrichment; muon annotates adata.obs as a side
        # effect, so the returned object is not needed here.
        ac.tl.tss_enrichment(adata, tss, n_tss=tss.shape[0])
        gr = pyranges.from_dict({"Chromosome": adata.var["seqnames"].values,
                                 "Start": adata.var["start"].values,
                                 "End": adata.var["end"].values,
                                 "interval": adata.var_names})
        tr = pyranges.from_dict({"Chromosome": tss["Chromosome"].values,
                                 "Start": tss["Start"].values,
                                 "End": tss["End"].values})
        tss_peaks = pd.unique(gr.join(tr).df["interval"])
        adata.var["overlap_tss"] = adata.var_names.isin(tss_peaks)
        qc_vars.append("overlap_tss")
        ### Promoters: upstream/downstream flanks mirrored on the minus strand
        if "strand" in tss.columns:
            tss["PromStart"] = tss["Start"] - promoter_upstream
            tss["PromEnd"] = tss["End"] + promoter_downstream
            tss.loc[tss["strand"] == "-", "PromStart"] = tss["Start"] - promoter_downstream
            tss.loc[tss["strand"] == "-", "PromEnd"] = tss["End"] + promoter_upstream
            pr = pyranges.from_dict({"Chromosome": tss["Chromosome"].values,
                                     "Start": tss["PromStart"].values,
                                     "End": tss["PromEnd"].values})
            prom_peaks = pd.unique(gr.join(pr).df["interval"])
            adata.var["overlap_promoter"] = adata.var_names.isin(prom_peaks)
            qc_vars.append("overlap_promoter")
    # FIX: keyword was misspelled `inpalce`, which calculate_qc_metrics
    # rejects with a TypeError; `inplace=True` stores the metrics on adata.
    sc.pp.calculate_qc_metrics(adata, qc_vars=qc_vars, inplace=True, percent_top=None)
    if sample is not None:
        adata.obs[sample_name] = sample
        adata.obs_names = [f"{sample}#{bc}" for bc in adata.obs_names]
    adata.write_h5ad(h5ad, compression="gzip", compression_opts=compression)
| 105 | + |
0 commit comments