Skip to content

Commit 38b6390

Browse files
committed
incorporate feedback
1 parent d4d62d7 commit 38b6390

File tree

6 files changed

+116
-23
lines changed

6 files changed

+116
-23
lines changed

src/anyvlm/anyvar/base_client.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ def put_allele_expressions(
2929
) -> Sequence[str | None]:
3030
"""Submit allele expressions to an AnyVar instance and retrieve corresponding VRS IDs
3131
32+
Currently, only expressions supported by the VRS-Python translator are supported.
33+
This could change depending on the AnyVar implementation, though, and probably
34+
can't be validated on the AnyVLM side.
35+
3236
:param expressions: variation expressions to register
3337
:param assembly: reference assembly used in variation expressions
3438
:return: list where the i'th item is either the VRS ID if translation succeeds,

src/anyvlm/anyvar/http_client.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import requests
88
from anyvar.utils.liftover_utils import ReferenceAssembly
99
from anyvar.utils.types import VrsVariation
10-
from ga4gh.vrs import models
10+
from ga4gh.vrs import VrsType, models
1111

1212
from anyvlm.anyvar.base_client import (
1313
AnyVarClientConnectionError,
@@ -40,19 +40,23 @@ def put_allele_expressions(
4040
) -> Sequence[str | None]:
4141
"""Submit allele expressions to an AnyVar instance and retrieve corresponding VRS IDs
4242
43+
Currently, only expressions supported by the VRS-Python translator are supported.
44+
This could change depending on the AnyVar implementation, though, and probably
45+
can't be validated on the AnyVLM side.
46+
4347
:param expressions: variation expressions to register
4448
:param assembly: reference assembly used in expressions
4549
:return: list where the i'th item is either the VRS ID if translation succeeds,
4650
else `None`, for the i'th expression
4751
:raise AnyVarClientError: for unexpected errors relating to specifics of client interface
4852
"""
4953
results = []
54+
url = f"{self.hostname}/variation"
5055
for expression in expressions:
51-
url = f"{self.hostname}/variation"
5256
payload = {
5357
"definition": expression,
5458
"assembly_name": assembly.value,
55-
"input_type": "Allele",
59+
"input_type": VrsType.ALLELE.value,
5660
}
5761
try:
5862
response = requests.put(

src/anyvlm/anyvar/python_client.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from anyvar.storage.base_storage import Storage
88
from anyvar.translate.translate import TranslationError, Translator
99
from anyvar.utils.liftover_utils import ReferenceAssembly
10-
from anyvar.utils.types import VrsVariation
10+
from anyvar.utils.types import SupportedVariationType, VrsVariation
1111
from ga4gh.vrs.dataproxy import DataProxyValidationError
1212

1313
from anyvlm.anyvar.base_client import BaseAnyVarClient
@@ -33,6 +33,10 @@ def put_allele_expressions(
3333
) -> Sequence[str | None]:
3434
"""Submit allele expressions to an AnyVar instance and retrieve corresponding VRS IDs
3535
36+
Currently, only expressions supported by the VRS-Python translator are supported.
37+
This could change depending on the AnyVar implementation, though, and probably
38+
can't be validated on the AnyVLM side.
39+
3640
:param expressions: variation expressions to register
3741
:param assembly: reference assembly used in expressions
3842
:return: list where the i'th item is either the VRS ID if translation succeeds,
@@ -43,7 +47,9 @@ def put_allele_expressions(
4347
translated_variation = None
4448
try:
4549
translated_variation = self.av.translator.translate_variation(
46-
expression, assembly=assembly.value
50+
expression,
51+
assembly=assembly.value,
52+
input_type=SupportedVariationType.ALLELE,
4753
)
4854
except DataProxyValidationError:
4955
_logger.exception("Found invalid base in expression %s", expression)

src/anyvlm/functions/ingest_vcf.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,16 @@
1717
AfData = namedtuple("AfData", ("ac", "an", "ac_het", "ac_hom", "ac_hemi"))
1818

1919

20+
class VcfAfColumnsError(Exception):
21+
"""Raise for missing VCF INFO columns that are required for AF ingestion"""
22+
23+
2024
def _yield_expression_af_batches(
2125
vcf: pysam.VariantFile, batch_size: int = 1000
2226
) -> Iterator[list[tuple[str, CohortAlleleFrequencyStudyResult]]]:
2327
"""Generate a variant expression-allele frequency data pairing, one at a time
2428
2529
:param vcf: VCF to pull variants from
26-
:param translator: VRS-Python variant translator for converting VCF expressions to VRS
27-
:param assembly: name of reference assembly used by VCF
2830
:param batch_size: size of return batches
2931
:return: iterator of lists of pairs of variant expressions and AF data classes
3032
"""
@@ -36,13 +38,19 @@ def _yield_expression_af_batches(
3638
_logger.info("Skipping missing allele at %s", record)
3739
continue
3840
expression = f"{record.chrom}-{record.pos}-{record.ref}-{alt}"
39-
af = AfData(
40-
ac=record.info["AC"][i],
41-
an=record.info["AN"],
42-
ac_het=record.info["AC_Het"][i],
43-
ac_hom=record.info["AC_Hom"][i],
44-
ac_hemi=record.info["AC_Hemi"][i],
45-
)
41+
try:
42+
af = AfData(
43+
ac=record.info["AC"][i],
44+
an=record.info["AN"],
45+
ac_het=record.info["AC_Het"][i],
46+
ac_hom=record.info["AC_Hom"][i],
47+
ac_hemi=record.info["AC_Hemi"][i],
48+
)
49+
except KeyError as e:
50+
info = record.info
51+
msg = f"One or more required INFO column is missing: {'AC' in info}, {'AN' in info}, {'AC_Het' in info}, {'AC_Hom' in info}, {'AC_Hemi' in info}"
52+
_logger.exception(msg)
53+
raise VcfAfColumnsError(msg) from e
4654
batch.append((expression, af))
4755
if len(batch) >= batch_size:
4856
_logger.debug("Yielding next batch")
@@ -72,6 +80,7 @@ def ingest_vcf(
7280
:param vcf_path: location of input file
7381
:param av: AnyVar client
7482
:param assembly: reference assembly used by VCF
83+
:raise VcfAfColumnsError: if VCF is missing required columns
7584
"""
7685
vcf = pysam.VariantFile(filename=vcf_path.absolute().as_uri(), mode="r")
7786

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##5UTR_annotation=Variant annotation from UTRAnnotator
4+
##5UTR_consequence=Variant consequence from UTRAnnotator
5+
##CADD_PHRED=PHRED-like scaled CADD score. CADD is only available here for non-commercial use. See CADD website for more information.
6+
##CADD_RAW=Raw CADD score. CADD is only available here for non-commercial use. See CADD website for more information.
7+
##Existing_InFrame_oORFs=The number of existing inFrame overlapping ORFs (inFrame oORF) at the 5 prime UTR
8+
##Existing_OutOfFrame_oORFs=The number of existing out-of-frame overlapping ORFs (OutOfFrame oORF) at the 5 prime UTR
9+
##Existing_uORFs=The number of existing uORFs with a stop codon within the 5 prime UTR
10+
##FILTER=<ID=EXCESS_ALLELES,Description="Site has an excess of alternate alleles based on the input threshold">
11+
##FILTER=<ID=ExcessHet,Description="Site has excess het value larger than the threshold">
12+
##FILTER=<ID=LowQual,Description="Low quality">
13+
##FILTER=<ID=NO_HQ_GENOTYPES,Description="Site has no high quality variant genotypes">
14+
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
15+
##FORMAT=<ID=FT,Number=.,Type=String,Description="Genotype-level filter">
16+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
17+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
18+
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
19+
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
20+
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phasing set (typically the position of the first variant in the set)">
21+
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
22+
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
23+
##INFO=<ID=AC_Hemi,Number=A,Type=Integer,Description="Allele counts in hemizygous genotypes">
24+
##INFO=<ID=AC_Het,Number=A,Type=Integer,Description="Allele counts in heterozygous genotypes">
25+
##INFO=<ID=AC_Hom,Number=A,Type=Integer,Description="Allele counts in homozygous genotypes">
26+
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
27+
##INFO=<ID=AS_QUALapprox,Number=1,Type=String,Description="Allele-specific QUAL approximations">
28+
##INFO=<ID=CALIBRATION_SENSITIVITY,Number=A,Type=String,Description="Calibration sensitivity corresponding to the value of SCORE">
29+
##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|GENE_PHENO|NEAREST|HGVS_OFFSET|AF|CLIN_SIG|SOMATIC|PHENO|REVEL|SpliceRegion|CADD_PHRED|CADD_RAW|5UTR_annotation|5UTR_consequence|Existing_InFrame_oORFs|Existing_OutOfFrame_oORFs|Existing_uORFs|SpliceAI_pred_DP_AG|SpliceAI_pred_DP_AL|SpliceAI_pred_DP_DG|SpliceAI_pred_DP_DL|SpliceAI_pred_DS_AG|SpliceAI_pred_DS_AL|SpliceAI_pred_DS_DG|SpliceAI_pred_DS_DL|SpliceAI_pred_SYMBOL">
30+
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
31+
##INFO=<ID=F_MISSING,Number=.,Type=Float,Description="Added by +fill-tags expression F_MISSING=F_MISSING">
32+
##INFO=<ID=OLD_MULTIALLELIC,Number=1,Type=String,Description="Original chr:pos:ref:alt encoding">
33+
##INFO=<ID=OLD_VARIANT,Number=.,Type=String,Description="Original chr:pos:ref:alt encoding">
34+
##INFO=<ID=QUALapprox,Number=1,Type=Integer,Description="Sum of PL[0] values; used to approximate the QUAL score">
35+
##INFO=<ID=SCORE,Number=A,Type=String,Description="Score according to the model applied by ScoreVariantAnnotations">
36+
##INFO=<ID=TYPE,Number=.,Type=String,Description="Variant type">
37+
##REVEL=Rare Exome Variant Ensemble Learner
38+
##SpliceAI_pred_DP_AG=SpliceAI predicted effect on splicing. Delta position for acceptor gain
39+
##SpliceAI_pred_DP_AL=SpliceAI predicted effect on splicing. Delta position for acceptor loss
40+
##SpliceAI_pred_DP_DG=SpliceAI predicted effect on splicing. Delta position for donor gain
41+
##SpliceAI_pred_DP_DL=SpliceAI predicted effect on splicing. Delta position for donor loss
42+
##SpliceAI_pred_DS_AG=SpliceAI predicted effect on splicing. Delta score for acceptor gain
43+
##SpliceAI_pred_DS_AL=SpliceAI predicted effect on splicing. Delta score for acceptor loss
44+
##SpliceAI_pred_DS_DG=SpliceAI predicted effect on splicing. Delta score for donor gain
45+
##SpliceAI_pred_DS_DL=SpliceAI predicted effect on splicing. Delta score for donor loss
46+
##SpliceAI_pred_SYMBOL=SpliceAI gene symbol
47+
##SpliceRegion=SpliceRegion predictions
48+
##contig=<ID=chr14,length=107043718>
49+
##high_CALIBRATION_SENSITIVITY_INDEL=Sample Genotype FT filter value indicating that the genotyped allele failed INDEL model calibration sensitivity cutoff (0.99)
50+
##high_CALIBRATION_SENSITIVITY_SNP=Sample Genotype FT filter value indicating that the genotyped allele failed SNP model calibration sensitivity cutoff (0.997)
51+
##source=SelectVariants
52+
##INFO=<ID=VRS_Allele_IDs,Number=R,Type=String,Description="The computed identifiers for the GA4GH VRS Alleles corresponding to the GT indexes of the REF and ALT alleles">
53+
##INFO=<ID=VRS_Error,Number=.,Type=String,Description="If an error occurred computing a VRS Identifier, the error message">
54+
##INFO=<ID=VRS_Starts,Number=R,Type=String,Description="Interresidue coordinates used as the location starts for the GA4GH VRS Alleles corresponding to the GT indexes of the REF and ALT alleles">
55+
##INFO=<ID=VRS_Ends,Number=R,Type=String,Description="Interresidue coordinates used as the location ends for the GA4GH VRS Alleles corresponding to the GT indexes of the REF and ALT alleles">
56+
##INFO=<ID=VRS_States,Number=R,Type=String,Description="The literal sequence states used for the GA4GH VRS Alleles corresponding to the GT indexes of the REF and ALT alleles">
57+
#CHROM POS ID REF ALT QUAL FILTER INFO
58+
chr14 18223529 . C A . LowQual;NO_HQ_GENOTYPES AC=1;AC_Hemi=0;AC_Het=1;AC_Hom=0;AF=0.000278707;AS_QUALapprox=0|55;CALIBRATION_SENSITIVITY=.;CSQ=A|intergenic_variant|MODIFIER|||||||||||||||rs1294420531||||||||OR11H12||||||||3.503|0.321010||||||||||||||;F_MISSING=0.23692;QUALapprox=55;SCORE=.;TYPE=SNP;VRS_Allele_IDs=ga4gh:VA.8OSPHYmhyg9hJTpFQ8aNcmLgYMR77ZyJ,ga4gh:VA.slgr2fnRKaUnQrJZvYNDGMrfZHw6QCr6;VRS_Starts=18223528,18223528;VRS_Ends=18223529,18223529;VRS_States=C,A
59+
chr14 18223557 . C T . . AC=2;AC_Hemi=0;AC_Het=2;AC_Hom=0;AF=0.000449236;AS_QUALapprox=0|118;CALIBRATION_SENSITIVITY=0.9951;CSQ=T|intergenic_variant|MODIFIER|||||||||||||||rs201056409||||||||OR11H12||||||||4.169|0.380250||||||||||||||;F_MISSING=0.0531689;QUALapprox=62;SCORE=-0.6499;TYPE=SNP;VRS_Allele_IDs=ga4gh:VA.W2CUxon4uJMb3B7txw-Ok4Kc7L6Y6_U6,ga4gh:VA.6Vh1yfYyljQHm6_qLTKqzi1URy8MfcGe;VRS_Starts=18223556,18223556;VRS_Ends=18223557,18223557;VRS_States=C,T
60+
chr14 18223583 . C G . ExcessHet AC=1268;AC_Hemi=0;AC_Het=1262;AC_Hom=6;AF=0.3916;AS_QUALapprox=0|83543;CALIBRATION_SENSITIVITY=.;CSQ=G|intergenic_variant|MODIFIER|||||||||||||||rs28973059||||||||OR11H12||0.3067||||||3.687|0.337395||||||||||||||;F_MISSING=0.311357;QUALapprox=67;SCORE=.;TYPE=SNP;VRS_Allele_IDs=ga4gh:VA.J2UbTk_frk4EJ884PUZ3q0jyhOhmO6Nb,ga4gh:VA.7RhOJ6GlTAnbiwEcfvl9ZKSzrJl47Emg;VRS_Starts=18223582,18223582;VRS_Ends=18223583,18223583;VRS_States=C,G
61+
chr14 18223586 . T C . LowQual;NO_HQ_GENOTYPES AC=1;AC_Hemi=0;AC_Het=1;AC_Hom=0;AF=0.000214041;AS_QUALapprox=0|50;CALIBRATION_SENSITIVITY=.;CSQ=C|intergenic_variant|MODIFIER|||||||||||||||rs987931840||||||||OR11H12||||||||5.409|0.494757||||||||||||||;F_MISSING=0.00638026;QUALapprox=50;SCORE=.;TYPE=SNP;VRS_Allele_IDs=ga4gh:VA.mp2_nBqnYfP1qh9lVEcstg1ZihTojgJF,ga4gh:VA.srLXVmS7-JU1hLfxMZkkgFMy64GS7D8H;VRS_Starts=18223585,18223585;VRS_Ends=18223586,18223586;VRS_States=T,C
62+
chr14 18223591 . G A . . AC=2;AC_Hemi=0;AC_Het=2;AC_Hom=0;AF=0.0004329;AS_QUALapprox=0|112;CALIBRATION_SENSITIVITY=0.9968;CSQ=A|intergenic_variant|MODIFIER|||||||||||||||rs913251146||||||||OR11H12||||||||3.844|0.351386||||||||||||||;F_MISSING=0.0174394;QUALapprox=65;SCORE=-0.6961;TYPE=SNP;VRS_Allele_IDs=ga4gh:VA.TyLqDjEF7ZqV8OuoIvbbqSlobvRN08j4,ga4gh:VA.1ra1LoRvuvAhbeKl4YgbdrGkXWjc8Lpc;VRS_Starts=18223590,18223590;VRS_Ends=18223591,18223591;VRS_States=G,A

tests/unit/functions/test_ingest_vcf.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from anyvar.utils.types import VrsVariation
77

88
from anyvlm.anyvar.base_client import BaseAnyVarClient
9-
from anyvlm.functions.ingest_vcf import ingest_vcf
9+
from anyvlm.functions.ingest_vcf import VcfAfColumnsError, ingest_vcf
1010

1111

1212
@pytest.fixture(scope="session")
@@ -75,18 +75,26 @@ def input_grch37_vcf_path(test_data_dir: Path) -> Path:
7575
return test_data_dir / "vcf" / "grch37_vcf.vcf"
7676

7777

78-
def test_ingest_vcf_grch38(
79-
input_grch38_vcf_path: Path, stub_anyvar_client: BaseAnyVarClient
80-
):
81-
ingest_vcf(input_grch38_vcf_path, stub_anyvar_client)
78+
def test_ingest_vcf_grch38(test_data_dir: Path, stub_anyvar_client: BaseAnyVarClient):
79+
ingest_vcf(test_data_dir / "vcf" / "grch38_vcf.vcf", stub_anyvar_client)
8280

8381

84-
def test_ingest_vcf_grch37(
85-
input_grch37_vcf_path: Path, stub_anyvar_client: BaseAnyVarClient
86-
):
87-
ingest_vcf(input_grch37_vcf_path, stub_anyvar_client, ReferenceAssembly.GRCH37)
82+
def test_ingest_vcf_grch37(test_data_dir: Path, stub_anyvar_client: BaseAnyVarClient):
83+
ingest_vcf(
84+
test_data_dir / "vcf" / "grch37_vcf.vcf",
85+
stub_anyvar_client,
86+
ReferenceAssembly.GRCH37,
87+
)
8888

8989

9090
def test_ingest_vcf_notfound(stub_anyvar_client: BaseAnyVarClient):
9191
with pytest.raises(FileNotFoundError):
9292
ingest_vcf(Path("file_that_doesnt_exist.vcf"), stub_anyvar_client)
93+
94+
95+
def test_ingest_vcf_infocol_missing(
96+
stub_anyvar_client: BaseAnyVarClient, test_data_dir: Path
97+
):
98+
"""Test smooth handling of VCF that's missing one or more required INFO columns"""
99+
with pytest.raises(VcfAfColumnsError):
100+
ingest_vcf(test_data_dir / "vcf" / "info_field_missing.vcf", stub_anyvar_client)

0 commit comments

Comments
 (0)