Skip to content

Commit 4aa09b9

Browse files
MuhammedHasan and Muhammed Hasan Celik authored
Refactor vep (#9)
* vep added
* vep implemented
* vep logs and api update
* decima vep bug fix
* bug fix
* tangermeme version
* ci
* attr window bug
* fix of typo and deprecated
* mutate function deleted and fix of string literal
---------
Co-authored-by: Muhammed Hasan Celik <celik.muhammed_hasan@gene.com>
1 parent 44d730b commit 4aa09b9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+6505
-155
lines changed

.github/workflows/run-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ jobs:
1010
runs-on: ubuntu-latest
1111
strategy:
1212
matrix:
13-
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
13+
python-version: ["3.9", "3.10", "3.11", "3.12"]
1414

1515
name: Python ${{ matrix.python-version }}
1616
steps:

setup.cfg

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ python_requires = >=3.9
5050
install_requires =
5151
importlib-metadata; python_version<"3.8"
5252
click
53-
wandb # TODO: move to optional
53+
more_itertools
54+
wandb
5455
numpy
5556
torch
5657
grelu
@@ -65,6 +66,8 @@ install_requires =
6566
anndata
6667
h5py
6768
pyBigWig
69+
pyarrow
70+
tangermeme<0.5
6871

6972
[options.packages.find]
7073
where = src
@@ -76,7 +79,7 @@ exclude =
7679
# `pip install decima[PDF]` like:
7780
# PDF = ReportLab; RXP
7881
optional =
79-
wandb
82+
cyvcf2
8083

8184
# Add here test requirements (semicolon/line-separated)
8285
testing =

src/decima/cli/__init__.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import logging
22
import click
33

4-
# from decima.cli.finetune import finetune
5-
from decima.cli.predict_genes import predict_genes
6-
from decima.cli.download import download
7-
from decima.cli.attributions import attributions
8-
from decima.cli.query_cell import query_cell
4+
from decima.cli.predict_genes import cli_predict_genes
5+
from decima.cli.download import cli_download
6+
from decima.cli.attributions import cli_attributions
7+
from decima.cli.query_cell import cli_query_cell
8+
from decima.cli.vep import cli_predict_variant_effect
9+
# from decima.cli.finetune import cli_finetune
910

1011

1112
logger = logging.getLogger("decima")
@@ -25,11 +26,12 @@ def main():
2526
pass
2627

2728

28-
# main.add_command(finetune)
29-
main.add_command(predict_genes)
30-
main.add_command(download)
31-
main.add_command(attributions)
32-
main.add_command(query_cell)
29+
# main.add_command(cli_finetune, name="finetune")
30+
main.add_command(cli_predict_genes, name="predict-genes")
31+
main.add_command(cli_download, name="download")
32+
main.add_command(cli_attributions, name="attributions")
33+
main.add_command(cli_query_cell, name="query-cell")
34+
main.add_command(cli_predict_variant_effect, name="vep")
3335

3436
if __name__ == "__main__":
3537
main()

src/decima/cli/attributions.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@
2222
default=0,
2323
help="Model to use for attribution analysis either replicate number or path to the model.",
2424
)
25+
@click.option(
26+
"--metadata",
27+
type=click.Path(exists=True),
28+
default=None,
29+
help="Path to the metadata anndata file. Default: None.",
30+
)
2531
@click.option(
2632
"--method", type=str, required=False, default="inputxgradient", help="Method to use for attribution analysis."
2733
)
@@ -30,8 +36,20 @@
3036
@click.option("--plot_seqlogo", is_flag=True, help="Generate sequence logo plots for peaks")
3137
@click.option("--seqlogo_window", type=int, default=50, help="Window size for sequence logo plots")
3238
@click.option("--dpi", type=int, default=100, help="DPI for attribution plots")
33-
def attributions(
34-
output_dir, genes, seqs, tasks, off_tasks, model, method, device, plot_peaks, plot_seqlogo, seqlogo_window, dpi
39+
def cli_attributions(
40+
output_dir,
41+
genes,
42+
seqs,
43+
tasks,
44+
off_tasks,
45+
model,
46+
metadata,
47+
method,
48+
device,
49+
plot_peaks,
50+
plot_seqlogo,
51+
seqlogo_window,
52+
dpi,
3553
):
3654
"""
3755
Generate and save attribution analysis results for a gene or a set of sequences.
@@ -80,6 +98,7 @@ def attributions(
8098
tasks=tasks,
8199
off_tasks=off_tasks,
82100
model=model,
101+
metadata_anndata=metadata,
83102
method=method,
84103
device=device,
85104
plot_peaks=plot_peaks,

src/decima/cli/download.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33

44

55
@click.command()
6-
def download():
6+
def cli_download():
77
"""Download all required data and model weights."""
88
download_decima_data()

src/decima/cli/finetune.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
@click.option("--grad", required=True, type=int, help="Gradient accumulation steps")
1717
@click.option("--replicate", default=0, type=int, help="Replication number")
1818
@click.option("--bs", default=4, type=int, help="Batch size")
19-
def finetune(name, dir, lr, weight, grad, replicate, bs):
19+
def cli_finetune(name, dir, lr, weight, grad, replicate, bs):
2020
"""Finetune the Decima model."""
2121
wandb.login(host="https://genentech.wandb.io")
2222
run = wandb.init(project="decima", dir=name, name=name)

src/decima/cli/predict_genes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
@click.option("--matrix_file", required=True, help="Path to h5ad file containing genes to predict.")
2121
@click.option("--out_file", required=True, help="Output file path.")
2222
@click.option("--max_seq_shift", default=0, help="Maximum jitter for augmentation.")
23-
def predict_genes(device, ckpts, h5_file, matrix_file, out_file, max_seq_shift):
23+
def cli_predict_genes(device, ckpts, h5_file, matrix_file, out_file, max_seq_shift):
2424
"""Make predictions for all genes."""
2525
torch.set_float32_matmul_precision("medium")
2626

src/decima/cli/query_cell.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44

55
@click.command()
66
@click.argument("query", default="")
7-
def query_cell(query=""):
7+
def cli_query_cell(query=""):
88
"""
9-
Query a cell using query strig
9+
Query a cell using query string
1010
1111
Examples:
1212

src/decima/cli/vep.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import click
2+
from decima.constants import DECIMA_CONTEXT_SIZE
3+
from decima.vep import predict_variant_effect
4+
5+
6+
@click.command()
7+
@click.option(
8+
"-v",
9+
"--variants",
10+
type=click.Path(exists=True),
11+
help="Path to the variant file .vcf file. VCF file need to be normalized. Try normalizing th vcf file incase of an error. `bcftools norm -f ref.fasta input.vcf.gz -o output.vcf.gz`",
12+
)
13+
@click.option("-o", "--output_pq", type=click.Path(), help="Path to the output parquet file.")
14+
@click.option("--tasks", type=str, default=None, help="Tasks to predict. If not provided, all tasks will be predicted.")
15+
@click.option(
16+
"--chunksize",
17+
type=int,
18+
default=10_000,
19+
help="Number of variants to process in each chunk. Loading variants in chunks is more memory efficient."
20+
"This chuck of variants will be process and saved to output parquet file before contineus to next chunk. Default: 10_000.",
21+
)
22+
@click.option(
23+
"--model",
24+
type=str,
25+
default="0",
26+
help="Model to use for variant effect prediction either replicate number or path to the model.",
27+
)
28+
@click.option(
29+
"--metadata",
30+
type=click.Path(exists=True),
31+
default=None,
32+
help="Path to the metadata anndata file. Default: None.",
33+
)
34+
@click.option(
35+
"--device", type=str, default=None, help="Device to use. Default: None which automatically selects the best device."
36+
)
37+
@click.option("--batch-size", type=int, default=8, help="Batch size for the model. Default: 8")
38+
@click.option("--num-workers", type=int, default=4, help="Number of workers for the loader. Default: 4")
39+
@click.option("--distance-type", type=str, default="tss", help="Type of distance. Default: tss.")
40+
@click.option(
41+
"--min-distance",
42+
type=float,
43+
default=0,
44+
help="Minimum distance from the end of the gene. Default: 0.",
45+
)
46+
@click.option(
47+
"--max-distance",
48+
type=float,
49+
default=DECIMA_CONTEXT_SIZE,
50+
help=f"Maximum distance from the TSS. Default: {DECIMA_CONTEXT_SIZE}.",
51+
)
52+
@click.option(
53+
"--include-cols",
54+
type=str,
55+
default=None,
56+
help="Columns to include in the output in the original tsv file to include in the output parquet file. Default: None.",
57+
)
58+
@click.option(
59+
"--gene-col",
60+
type=str,
61+
default=None,
62+
help="Column name for gene names. Default: None.",
63+
)
64+
@click.option("--genome", type=str, default="hg38", help="Genome build. Default: hg38.")
65+
def cli_predict_variant_effect(
66+
variants,
67+
output_pq,
68+
tasks,
69+
chunksize,
70+
model,
71+
metadata,
72+
device,
73+
batch_size,
74+
num_workers,
75+
distance_type,
76+
min_distance,
77+
max_distance,
78+
include_cols,
79+
gene_col,
80+
genome,
81+
):
82+
"""Predict variant effect and save to parquet
83+
84+
Examples:
85+
86+
>>> decima vep -v "data/sample.vcf" -o "vep_results.parquet"
87+
88+
>>> decima vep -v "data/sample.vcf" -o "vep_results.parquet" --tasks "cell_type == 'classical monocyte'" # only predict for classical monocytes
89+
90+
>>> decima vep -v "data/sample.vcf" -o "vep_results.parquet" --device 0 # use device gpu device 0
91+
92+
>>> decima vep -v "data/sample.vcf" -o "vep_results.parquet" --include-cols "gene_name,gene_id" # include gene_name and gene_id columns in the output
93+
94+
>>> decima vep -v "data/sample.vcf" -o "vep_results.parquet" --gene-col "gene_name" # use gene_name column as gene names if these option passed genes and variants mapped based on these column not based on the genomic locus based on the annotaiton.
95+
96+
>>> decima vep -v "data/sample.vcf" -o "vep_results.parquet" --distance-type tss --min-distance 50000 --max-distance 100000 # predict for variants within 50kb of the TSS and 100kb of the TSS
97+
"""
98+
if model in ["0", "1", "2", "3"]: # replicate index
99+
model = int(model)
100+
101+
if isinstance(device, str) and device.isdigit():
102+
device = int(device)
103+
104+
if include_cols:
105+
include_cols = include_cols.split(",")
106+
107+
predict_variant_effect(
108+
variants,
109+
output_pq=output_pq,
110+
tasks=tasks,
111+
model=model,
112+
metadata_anndata=metadata,
113+
chunksize=chunksize,
114+
device=device,
115+
batch_size=batch_size,
116+
num_workers=num_workers,
117+
distance_type=distance_type,
118+
min_distance=min_distance,
119+
max_distance=max_distance,
120+
include_cols=include_cols,
121+
gene_col=gene_col,
122+
genome=genome,
123+
)

src/decima/core/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-
from decima.core.result import DecimaResult

0 commit comments

Comments
 (0)