Skip to content

Commit c8dc145

Browse files
authored
feat: support for bulk writing allele frequency records (#64)
1 parent fac9e72 commit c8dc145

File tree

6 files changed

+31
-23
lines changed

6 files changed

+31
-23
lines changed

src/anyvlm/functions/ingest_vcf.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,10 @@ class VcfAfColumnsError(Exception):
3131
def _yield_expression_af_batches(
3232
vcf: pysam.VariantFile, batch_size: int = 1000
3333
) -> Iterator[list[tuple[str, AfData]]]:
34-
"""Generate a variant expression-allele frequency data pairing, one at a time
34+
"""Generate batches of tuples of (variant expression, allele frequency data).
35+
36+
Operates lazily so only one batch is in memory at a time. If a VCF record has
37+
multiple alternate alleles, each is returned as a separate item.
3538
3639
:param vcf: VCF to pull variants from
3740
:param batch_size: size of return batches
@@ -98,6 +101,8 @@ def ingest_vcf(
98101
for batch in _yield_expression_af_batches(vcf):
99102
expressions, afs = zip(*batch, strict=True)
100103
variant_ids = av.put_allele_expressions(expressions, assembly)
104+
105+
cafs = []
101106
for variant_id, af in zip(variant_ids, afs, strict=True):
102107
if variant_id is None:
103108
continue
@@ -114,4 +119,6 @@ def ingest_vcf(
114119
),
115120
cohort=StudyGroup(name="rare disease"),
116121
)
117-
storage.add_allele_frequencies(caf)
122+
cafs.append(caf)
123+
124+
storage.add_allele_frequencies(cafs)

src/anyvlm/storage/base_storage.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,12 @@ def sanitized_url(self) -> str:
3030
"""Return a sanitized URL (password masked) of the database connection string."""
3131

3232
@abstractmethod
33-
def add_allele_frequencies(self, caf: AnyVlmCohortAlleleFrequencyResult) -> None:
33+
def add_allele_frequencies(
34+
self, cafs: list[AnyVlmCohortAlleleFrequencyResult]
35+
) -> None:
3436
"""Add allele frequency data to the database. Will skip conflicts.
3537
36-
NOTE: For now, this will only insert a single caf record into the database.
37-
Single insertion is used to do a simple test of the storage backend.
38-
Issue-34 will support batch insertion of caf records.
39-
40-
:param caf: Cohort allele frequency study result object to insert into the DB
38+
:param cafs: List of cohort allele frequency study result objects to insert
4139
"""
4240

4341
@abstractmethod

src/anyvlm/storage/postgres.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,20 +54,21 @@ def sanitized_url(self) -> str:
5454
netloc += f":{parsed.port}"
5555
return f"{parsed.scheme}://{netloc}{parsed.path}"
5656

57-
def add_allele_frequencies(self, caf: AnyVlmCohortAlleleFrequencyResult) -> None:
57+
def add_allele_frequencies(
58+
self, cafs: list[AnyVlmCohortAlleleFrequencyResult]
59+
) -> None:
5860
"""Add allele frequency data to the database. Will skip conflicts.
5961
60-
NOTE: For now, this will only insert a single caf record into the database.
61-
Single insertion is used to do a simple test of the storage backend.
62-
Issue-34 will support batch insertion of caf records.
63-
64-
:param caf: Cohort allele frequency study result object to insert into the DB
62+
:param cafs: List of cohort allele frequency study result objects to insert
6563
"""
66-
db_entity = mapper_registry.to_db_entity(caf)
64+
if not cafs:
65+
return
66+
67+
db_entities = [mapper_registry.to_db_entity(caf) for caf in cafs]
6768
stmt = insert(orm.AlleleFrequencyData).on_conflict_do_nothing()
6869

6970
with self.session_factory() as session, session.begin():
70-
session.execute(stmt, db_entity.to_dict())
71+
session.execute(stmt, [entity.to_dict() for entity in db_entities])
7172

7273
def get_caf_by_vrs_allele_id(
7374
self, vrs_allele_id: str

tests/integration/functions/test_get_caf.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,11 @@ def populated_postgres_storage(
7575
caf_iri: AnyVlmCohortAlleleFrequencyResult,
7676
):
7777
"""Populate the postgres storage with allele frequencies for testing"""
78-
for allele in alleles.values():
79-
caf = build_caf(caf_iri, allele_id=allele["variation"]["id"])
80-
postgres_storage.add_allele_frequencies(caf)
78+
cafs = [
79+
build_caf(caf_iri, allele_id=allele["variation"]["id"])
80+
for allele in alleles.values()
81+
]
82+
postgres_storage.add_allele_frequencies(cafs)
8183
return postgres_storage
8284

8385

tests/integration/storage/test_postgres_integration.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def test_db_lifecycle(
2222
caf_rows = return_cafs(storage)
2323
assert caf_rows == []
2424

25-
storage.add_allele_frequencies(caf_iri)
25+
storage.add_allele_frequencies([caf_iri])
2626
caf_rows = return_cafs(storage)
2727
assert len(caf_rows) == 1
2828

tests/unit/storage/test_postgres_unit.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def test_add_allele_frequencies(
6262
"""Test that add_allele_frequencies method works correctly"""
6363
caf = request.getfixturevalue(caf_fixture_name)
6464
try:
65-
postgres_storage.add_allele_frequencies(caf)
65+
postgres_storage.add_allele_frequencies([caf])
6666
except Exception as e: # noqa: BLE001
6767
pytest.fail(f"add_allele_frequencies raised an exception: {e}")
6868

@@ -75,7 +75,7 @@ def test_add_allele_frequencies(
7575
cohort=StudyGroup(name="rare disease"), # type: ignore
7676
) # type: ignore
7777

78-
postgres_storage.add_allele_frequencies(caf)
78+
postgres_storage.add_allele_frequencies([caf])
7979

8080

8181
def test_add_allele_frequencies_failures(
@@ -84,4 +84,4 @@ def test_add_allele_frequencies_failures(
8484
):
8585
"""Test that add_allele_frequencies method fails correctly on bad input"""
8686
with pytest.raises(IntegrityError, match='null value in column "cohort"'):
87-
postgres_storage.add_allele_frequencies(caf_empty_cohort)
87+
postgres_storage.add_allele_frequencies([caf_empty_cohort])

0 commit comments

Comments
 (0)