@@ -31,7 +31,10 @@ class VcfAfColumnsError(Exception):
3131def _yield_expression_af_batches (
3232 vcf : pysam .VariantFile , batch_size : int = 1000
3333) -> Iterator [list [tuple [str , AfData ]]]:
34- """Generate a variant expression-allele frequency data pairing, one at a time
34+ """Generate batches of tuples of (variant expression, allele frequency data).
35+
36+ Operates lazily so only one batch is in memory at a time. If a VCF record has
37+ multiple alternate alleles, each is returned as a separate item.
3538
3639 :param vcf: VCF to pull variants from
3740 :param batch_size: size of return batches
@@ -98,6 +101,8 @@ def ingest_vcf(
98101 for batch in _yield_expression_af_batches (vcf ):
99102 expressions , afs = zip (* batch , strict = True )
100103 variant_ids = av .put_allele_expressions (expressions , assembly )
104+
105+ cafs = []
101106 for variant_id , af in zip (variant_ids , afs , strict = True ):
102107 if variant_id is None :
103108 continue
@@ -114,4 +119,6 @@ def ingest_vcf(
114119 ),
115120 cohort = StudyGroup (name = "rare disease" ),
116121 )
117- storage .add_allele_frequencies (caf )
122+ cafs .append (caf )
123+
124+ storage .add_allele_frequencies (cafs )
0 commit comments