Skip to content

Commit d8ee952

Browse files
committed
ADD reference_genome parameter to CLI
1 parent 1ffd061 commit d8ee952

File tree

2 files changed

+26
-12
lines changed

2 files changed

+26
-12
lines changed

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,11 +72,14 @@ In the GUI version of gnomAD Python API, [Streamlit](https://www.streamlit.io/)
7272
| -search_by | *It defines the input.* | Type a gene/transcript identifier <br> *e.g.: TP53, ENSG00000169174, ENST00000544455* <br> Type the name of the file containing your inputs <br> *e.g.: myGenes.txt*
7373
| -dataset | *It defines the dataset.* | `exac`, `gnomad_r2_1`, `gnomad_r3`, `gnomad_r2_1_controls`, `gnomad_r2_1_non_neuro`, `gnomad_r2_1_non_cancer`, or `gnomad_r2_1_non_topmed`
7474
| -sv_dataset | *It defines the structural variants dataset.* | `gnomad_sv_r2_1`, `gnomad_sv_r2_1_controls`, or `gnomad_sv_r2_1_non_neuro`
75+
| -reference_genome | *It defines the reference genome build.* | `GRCh37` or `GRCh38`
7576
| -h | *It displays the parameters.* | *To get help via script:* `python gnomad_api_cli.py -h`
7677

7778

7879
> ❗ When retrieving variants, `gnomad_r2_1` and `gnomad_sv_r2_1` are the default values for the `-dataset` and `-sv_dataset` options, respectively.
79-
>
80+
>
81+
>
82+
> ❗ Also, you must choose `GRCh38` to retrieve variants from the `gnomad_r3` dataset. Note, however, that structural variants are not available for the `GRCh38` build.
8083
8184
## :hash: CLI | Example Usages
8285
- **How to list the variants by gene name or gene id?**
@@ -85,6 +88,10 @@ In the GUI version of gnomAD Python API, [Streamlit](https://www.streamlit.io/)
8588

8689
`python gnomad_api_cli.py -filter_by=gene_name -search_by="BRCA1" -dataset="gnomad_r2_1" -sv_dataset="gnomad_sv_r2_1"`
8790

91+
To get data from `gnomad_r3`:
92+
93+
`python gnomad_api_cli.py -filter_by=gene_name -search_by="BRCA1" -dataset="gnomad_r3" -reference_genome="GRCh38"`
94+
8895
*For Ensembl gene ID*
8996

9097
`python gnomad_api_cli.py -filter_by=gene_id -search_by="ENSG00000169174" -dataset="gnomad_r2_1" -sv_dataset="gnomad_sv_r2_1"`

gnomad_api_cli.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ def arg_parser():
3131
parser.add_argument("-filter_by", type=str, required=True, default="gene_name", help="Get your variants according to: `gene_name`, `gene_id, `transcript_id` or `rs_id`.")
3232
parser.add_argument("-search_by", type=str, required=True, default="TP53", help="Type your input for searching or type the file name (e.g: myGenes.txt) containing your inputs")
3333
parser.add_argument("-dataset", type=str, required=True, default="gnomad_r2_1", help="Select your dataset: exac, gnomad_r2_1, gnomad_r3, gnomad_r2_1_controls, gnomad_r2_1_non_neuro, gnomad_r2_1_non_cancer, gnomad_r2_1_non_topmed")
34+
parser.add_argument("-reference_genome", type=str, required=False, default="GRCh37", help="Select a proper reference genome build : `GRCh37` or `GRCh38`")
3435
parser.add_argument("-sv_dataset", type=str, required=False, default="gnomad_sv_r2_1", help="Select your structural variants dataset : `gnomad_sv_r2_1`, `gnomad_sv_r2_1_controls` or `gnomad_sv_r2_1_non_neuro`")
35-
# parser.add_argument("-get", nargs="+", default=["gnomad", "clinvar"], help="List your requests comma seperated: `gnomad`, `clinvar`, `gtex_tissue_expression`, `genome_coverage`, `exome_coverage`, `gnomad_constraint`, or `exac_constraint`")
3636
args = parser.parse_args()
3737

3838
# Control the given arguments
@@ -44,24 +44,31 @@ def arg_parser():
4444

4545
if args.filter_by not in ["gene_name", "gene_id", "transcript_id", "rs_id"]:
4646
sys.exit("! Select a proper filter type :\n\t `gene_name`, `gene_id, `transcript_id` or `rs_id`")
47+
48+
if args.reference_genome not in ["GRCh37", "GRCh38"]:
49+
sys.exit("! Select a proper reference genome build :\n\t `GRCh37` or `GRCh38`")
50+
51+
if (args.dataset == "gnomad_r3") and (args.reference_genome == "GRCh37"):
52+
sys.exit("! You need to select `GRCh38` reference genome build for getting data from `gnomad_r3`.")
4753

4854
# Define variables
4955
filter_by = args.filter_by
5056
search_by = args.search_by
5157
dataset = args.dataset
5258
sv_dataset = args.sv_dataset
59+
reference_genome = args.reference_genome
5360

54-
return filter_by, search_by, dataset, sv_dataset
61+
return filter_by, search_by, dataset, sv_dataset, reference_genome
5562

5663
# gnomAD API
5764
end_point = "https://gnomad.broadinstitute.org/api/"
5865

5966
# Main Function
60-
def get_variants_by(filter_by, search_term, dataset, timeout=None):
67+
def get_variants_by(filter_by, search_term, dataset, reference_genome, sv_dataset, timeout=None):
6168

6269
query_for_transcripts = """
6370
{
64-
transcript(transcript_id: "%s") {
71+
transcript(transcript_id: "%s", reference_genome: %s) {
6572
transcript_id,
6673
transcript_version,
6774
gene {
@@ -430,7 +437,7 @@ def get_variants_by(filter_by, search_term, dataset, timeout=None):
430437

431438
query_for_genes = """
432439
{
433-
gene(%s: "%s") {
440+
gene(%s: "%s", reference_genome: %s) {
434441
gene_id
435442
symbol
436443
start
@@ -612,16 +619,16 @@ def get_variants_by(filter_by, search_term, dataset, timeout=None):
612619
"""
613620

614621
if filter_by == "transcript_id":
615-
query = query_for_transcripts % (search_term.upper(), dataset, dataset)
622+
query = query_for_transcripts % (search_term.upper(), reference_genome, dataset, dataset)
616623

617624
elif filter_by == "rs_id":
618625
query = query_for_variants % ("rsid", search_term.lower(), dataset)
619626

620627
elif filter_by == "gene_id":
621-
query = query_for_genes % ("gene_id", search_term.upper(), sv_dataset, dataset, dataset)
628+
query = query_for_genes % ("gene_id", search_term.upper(), reference_genome, sv_dataset, dataset, dataset)
622629

623630
elif filter_by == "gene_name":
624-
query = query_for_genes % ("gene_name", search_term.upper(), sv_dataset, dataset, dataset)
631+
query = query_for_genes % ("gene_name", search_term.upper(), reference_genome, sv_dataset, dataset, dataset)
625632

626633
else:
627634
print("Unknown `filter_by` type!")
@@ -759,18 +766,18 @@ def get_variants_by(filter_by, search_term, dataset, timeout=None):
759766

760767
# Action
761768
if __name__ == "__main__":
762-
filter_by, search_by, dataset, sv_dataset = arg_parser()
769+
filter_by, search_by, dataset, sv_dataset, reference_genome = arg_parser()
763770
if "." in search_by:
764771
try:
765772
with open(search_by, "r") as f:
766773
search_list = [line.rstrip() for line in f]
767774
for search_item in tqdm(search_list):
768-
get_variants_by(filter_by, search_item, dataset)
775+
get_variants_by(filter_by, search_item, dataset, reference_genome, sv_dataset)
769776
except:
770777
print("A problem occured while reading the file namely `{}` or the filter type `{}` is wrong!"\
771778
.format(search_by, filter_by))
772779
finally:
773780
f.close()
774781
elif "." not in search_by:
775-
get_variants_by(filter_by, search_by, dataset)
782+
get_variants_by(filter_by, search_by, dataset, reference_genome, sv_dataset)
776783

0 commit comments

Comments
 (0)