|
| 1 | +import click |
| 2 | +from decima.constants import DECIMA_CONTEXT_SIZE |
| 3 | +from decima.vep import predict_variant_effect |
| 4 | + |
| 5 | + |
@click.command()
@click.option(
    "-v",
    "--variants",
    type=click.Path(exists=True),
    help=(
        "Path to the variant .vcf file. The VCF file needs to be normalized. "
        "Try normalizing the VCF file in case of an error: "
        "`bcftools norm -f ref.fasta input.vcf.gz -o output.vcf.gz`"
    ),
)
@click.option("-o", "--output_pq", type=click.Path(), help="Path to the output parquet file.")
@click.option("--tasks", type=str, default=None, help="Tasks to predict. If not provided, all tasks will be predicted.")
@click.option(
    "--chunksize",
    type=int,
    default=10_000,
    help=(
        "Number of variants to process in each chunk. Loading variants in chunks is more memory efficient. "
        "Each chunk of variants is processed and saved to the output parquet file "
        "before continuing to the next chunk. Default: 10_000."
    ),
)
@click.option(
    "--model",
    type=str,
    default="0",
    help="Model to use for variant effect prediction either replicate number or path to the model.",
)
@click.option(
    "--metadata",
    type=click.Path(exists=True),
    default=None,
    help="Path to the metadata anndata file. Default: None.",
)
@click.option(
    "--device", type=str, default=None, help="Device to use. Default: None which automatically selects the best device."
)
@click.option("--batch-size", type=int, default=8, help="Batch size for the model. Default: 8")
@click.option("--num-workers", type=int, default=4, help="Number of workers for the loader. Default: 4")
@click.option("--distance-type", type=str, default="tss", help="Type of distance. Default: tss.")
@click.option(
    "--min-distance",
    type=float,
    default=0,
    # NOTE(review): "end of the gene" here vs. "TSS" for --max-distance looks inconsistent;
    # confirm against predict_variant_effect before rewording.
    help="Minimum distance from the end of the gene. Default: 0.",
)
@click.option(
    "--max-distance",
    type=float,
    default=DECIMA_CONTEXT_SIZE,
    help=f"Maximum distance from the TSS. Default: {DECIMA_CONTEXT_SIZE}.",
)
@click.option(
    "--include-cols",
    type=str,
    default=None,
    help="Comma-separated columns from the original tsv file to include in the output parquet file. Default: None.",
)
@click.option(
    "--gene-col",
    type=str,
    default=None,
    help="Column name for gene names. Default: None.",
)
@click.option("--genome", type=str, default="hg38", help="Genome build. Default: hg38.")
def cli_predict_variant_effect(
    variants,
    output_pq,
    tasks,
    chunksize,
    model,
    metadata,
    device,
    batch_size,
    num_workers,
    distance_type,
    min_distance,
    max_distance,
    include_cols,
    gene_col,
    genome,
):
    """Predict variant effect and save to parquet

    Examples:

        >>> decima vep -v "data/sample.vcf" -o "vep_results.parquet"

        >>> decima vep -v "data/sample.vcf" -o "vep_results.parquet" --tasks "cell_type == 'classical monocyte'" # only predict for classical monocytes

        >>> decima vep -v "data/sample.vcf" -o "vep_results.parquet" --device 0 # use GPU device 0

        >>> decima vep -v "data/sample.vcf" -o "vep_results.parquet" --include-cols "gene_name,gene_id" # include gene_name and gene_id columns in the output

        >>> decima vep -v "data/sample.vcf" -o "vep_results.parquet" --gene-col "gene_name" # use gene_name column as gene names; if this option is passed, genes and variants are mapped based on this column rather than on the genomic locus from the annotation.

        >>> decima vep -v "data/sample.vcf" -o "vep_results.parquet" --distance-type tss --min-distance 50000 --max-distance 100000 # predict for variants between 50kb and 100kb from the TSS
    """
    # NOTE: the docstring above is what click prints for `--help`, so it is kept
    # user-facing; implementation notes live in comments below.

    # "--model" arrives as a string: the values "0".."3" are replicate indices and
    # are forwarded as ints, anything else is forwarded unchanged (a model path).
    if model in {"0", "1", "2", "3"}:
        model = int(model)

    # A purely numeric "--device" value is forwarded as an int device index;
    # other strings and None are passed through untouched.
    if isinstance(device, str) and device.isdigit():
        device = int(device)

    # "--include-cols" is a comma-separated list. Strip whitespace around each
    # name and drop empty entries so "a, b" and "a,b," both yield ["a", "b"];
    # fall back to None (the option's default) when nothing usable remains.
    if include_cols:
        stripped_cols = (col.strip() for col in include_cols.split(","))
        include_cols = [col for col in stripped_cols if col] or None

    predict_variant_effect(
        variants,
        output_pq=output_pq,
        tasks=tasks,
        model=model,
        metadata_anndata=metadata,
        chunksize=chunksize,
        device=device,
        batch_size=batch_size,
        num_workers=num_workers,
        distance_type=distance_type,
        min_distance=min_distance,
        max_distance=max_distance,
        include_cols=include_cols,
        gene_col=gene_col,
        genome=genome,
    )
0 commit comments