Skip to content

Commit bc29fc3

Browse files
authored
Merge pull request #303 from VariantEffect/feature/bencap/302/require-fully-qualified-variants-for-accession-based-score-sets
Require fully qualified variants for accession based score sets
2 parents b1ed277 + 074c863 commit bc29fc3

File tree

10 files changed

+131
-96
lines changed

10 files changed

+131
-96
lines changed
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import sqlalchemy as sa
2+
from sqlalchemy.orm import Session, configure_mappers
3+
4+
from mavedb.models import *
5+
6+
from mavedb.models.score_set import ScoreSet
7+
from mavedb.models.variant import Variant
8+
from mavedb.models.target_gene import TargetGene
9+
from mavedb.models.target_accession import TargetAccession
10+
11+
from mavedb.db.session import SessionLocal
12+
13+
configure_mappers()
14+
15+
16+
def do_migration(db: Session):
17+
accession_based_score_sets = db.execute(
18+
sa.select(ScoreSet).join(TargetGene).where(TargetGene.accession_id.isnot(None))
19+
).scalars()
20+
21+
for score_set in accession_based_score_sets:
22+
total_targets = len(
23+
list(db.execute(sa.select(TargetGene).where(TargetGene.score_set_id == score_set.id)).scalars())
24+
)
25+
26+
# Variants from score sets with multiple targets are already in the desired format.
27+
if total_targets > 1:
28+
continue
29+
30+
target_accession = db.execute(
31+
sa.select(TargetAccession.accession).join(TargetGene).where(TargetGene.score_set_id == score_set.id)
32+
).scalar()
33+
variants = db.execute(sa.select(Variant).where(Variant.score_set_id == score_set.id)).scalars()
34+
35+
if target_accession is None:
36+
raise ValueError("target accession should never be None.")
37+
38+
for variant in variants:
39+
if variant.hgvs_nt:
40+
variant.hgvs_nt = f"{target_accession}:{variant.hgvs_nt}"
41+
if variant.hgvs_pro:
42+
variant.hgvs_pro = f"{target_accession}:{variant.hgvs_pro}"
43+
if variant.hgvs_splice:
44+
variant.hgvs_splice = f"{target_accession}:{variant.hgvs_splice}"
45+
46+
db.add(variant)
47+
48+
49+
if __name__ == "__main__":
50+
db = SessionLocal()
51+
db.current_user = None # type: ignore
52+
53+
do_migration(db)
54+
55+
db.commit()
56+
db.close()

src/mavedb/lib/validation/dataframe.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -202,20 +202,23 @@ def validate_dataframe(df: pd.DataFrame, kind: str, targets: list["TargetGene"],
202202
if df[column_mapping[c]].isna().all() and not is_index:
203203
continue
204204

205+
score_set_is_accession_based = all(target.target_accession for target in targets)
206+
score_set_is_sequence_based = all(target.target_sequence for target in targets)
207+
205208
# This is typesafe, despite Pylance's claims otherwise
206-
if all(target.target_accession for target in targets):
209+
if score_set_is_accession_based and not score_set_is_sequence_based:
207210
validate_hgvs_genomic_column(
208211
df[column_mapping[c]], is_index, [target.target_accession for target in targets], hdp # type: ignore
209212
)
210-
elif all(target.target_sequence for target in targets):
213+
elif score_set_is_sequence_based and not score_set_is_accession_based:
211214
validate_hgvs_transgenic_column(
212215
df[column_mapping[c]], is_index, {target.target_sequence.label: target.target_sequence for target in targets} # type: ignore
213216
)
214217
else:
215218
raise MixedTargetError("Could not validate dataframe against provided mixed target types.")
216219

217220
# post validation, handle prefixes. We've already established these columns are non-null
218-
if len(targets) > 1:
221+
if score_set_is_accession_based or len(targets) > 1:
219222
prefixes[c] = (
220223
df[column_mapping[c]].dropna()[0].split(" ")[0].split(":")[1][0]
221224
) # Just take the first prefix, we validate consistency elsewhere
@@ -374,7 +377,7 @@ def validate_hgvs_transgenic_column(column: pd.Series, is_index: bool, targets:
374377
valid_sequence_types = ("dna", "protein")
375378
validate_variant_column(column, is_index)
376379
prefixes = generate_variant_prefixes(column)
377-
validate_variant_formatting(column, prefixes, list(targets.keys()))
380+
validate_variant_formatting(column, prefixes, list(targets.keys()), len(targets) > 1)
378381

379382
observed_sequence_types = [target.sequence_type for target in targets.values()]
380383
invalid_sequence_types = set(observed_sequence_types) - set(valid_sequence_types)
@@ -454,9 +457,6 @@ def validate_hgvs_genomic_column(
454457
This function also validates all individual variants in the column and checks for agreement against the target
455458
sequence (for non-splice variants).
456459
457-
Implementation NOTE: We assume variants will only be presented as fully qualified (accession:variant)
458-
if this column is being validated against multiple targets.
459-
460460
Parameters
461461
----------
462462
column : pd.Series
@@ -482,7 +482,7 @@ def validate_hgvs_genomic_column(
482482
validate_variant_column(column, is_index)
483483
prefixes = generate_variant_prefixes(column)
484484
validate_variant_formatting(
485-
column, prefixes, [target.accession for target in targets if target.accession is not None]
485+
column, prefixes, [target.accession for target in targets if target.accession is not None], True
486486
)
487487

488488
# validate the individual variant strings
@@ -508,12 +508,9 @@ def validate_hgvs_genomic_column(
508508
for i, s in column.items():
509509
if s is not None:
510510
for variant in s.split(" "):
511-
# Add accession info when we only have one target
512-
if len(targets) == 1:
513-
s = f"{targets[0].accession}:{variant}"
514511
try:
515512
# We set strict to `False` to suppress validation warnings about intronic variants.
516-
vr.validate(hp.parse(s), strict=False)
513+
vr.validate(hp.parse(variant), strict=False)
517514
except hgvs.exceptions.HGVSError as e:
518515
invalid_variants.append(f"Failed to parse row {i} with HGVS exception: {e}")
519516

@@ -524,7 +521,7 @@ def validate_hgvs_genomic_column(
524521
)
525522

526523

527-
def validate_variant_formatting(column: pd.Series, prefixes: list[str], targets: list[str]):
524+
def validate_variant_formatting(column: pd.Series, prefixes: list[str], targets: list[str], fully_qualified: bool):
528525
"""
529526
Validate the formatting of HGVS variants present in the passed column against
530527
lists of prefixes and targets
@@ -554,7 +551,7 @@ def validate_variant_formatting(column: pd.Series, prefixes: list[str], targets:
554551
variants = [variant for s in column.dropna() for variant in s.split(" ")]
555552

556553
# if there is more than one target, we expect variants to be fully qualified
557-
if len(targets) > 1:
554+
if fully_qualified:
558555
if not all(len(str(v).split(":")) == 2 for v in variants):
559556
raise ValidationError(
560557
f"variant column '{column.name}' needs fully qualified coordinates when validating against multiple targets"

tests/conftest.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,14 @@ def session(postgresql):
4848
)
4949

5050
engine = create_engine(connection, echo=False, poolclass=NullPool)
51-
session = sessionmaker(autocommit=False, autoflush=False, bind=engine)
51+
session = sessionmaker(autocommit=False, autoflush=False, bind=engine)()
5252

5353
Base.metadata.create_all(bind=engine)
5454

5555
try:
56-
yield session()
56+
yield session
5757
finally:
58+
session.close()
5859
Base.metadata.drop_all(bind=engine)
5960

6061

@@ -170,7 +171,14 @@ async def on_job(ctx):
170171

171172
@pytest.fixture
172173
def standalone_worker_context(session, data_provider, arq_redis):
173-
yield {"db": session, "hdp": data_provider, "state": {}, "job_id": "test_job", "redis": arq_redis, "pool": futures.ProcessPoolExecutor()}
174+
yield {
175+
"db": session,
176+
"hdp": data_provider,
177+
"state": {},
178+
"job_id": "test_job",
179+
"redis": arq_redis,
180+
"pool": futures.ProcessPoolExecutor(),
181+
}
174182

175183

176184
@pytest.fixture()

tests/helpers/util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def mock_worker_variant_insertion(client, db, data_provider, score_set, scores_c
126126
score_df = csv_data_to_df(score_file)
127127

128128
if counts_csv_path is not None:
129-
with open(scores_csv_path, "rb") as score_file:
129+
with open(scores_csv_path, "rb") as counts_file:
130130
counts_df = csv_data_to_df(counts_file)
131131
else:
132132
counts_df = None

tests/routers/data/counts_acc.csv

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
hgvs_nt,c_0,c_1
2-
c.1G>C,10,20
3-
c.2A>G,8,8
4-
c.6C>A,90,2
2+
NM_001637.3:c.1G>C,10,20
3+
NM_001637.3:c.2A>G,8,8
4+
NM_001637.3:c.6C>A,90,2

tests/routers/data/scores_acc.csv

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
hgvs_nt,score
2-
c.1G>C,0.3
3-
c.2A>G,0.0
4-
c.6C>A,-1.65
2+
NM_001637.3:c.1G>C,0.3
3+
NM_001637.3:c.2A>G,0.0
4+
NM_001637.3:c.6C>A,-1.65

tests/validation/test_dataframe.py

Lines changed: 35 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -515,34 +515,34 @@ def setUp(self) -> None:
515515
self.valid_targets = ["test1", "test2"]
516516

517517
def test_single_target_valid_variants(self):
518-
validate_variant_formatting(self.valid, self.valid_prefixes, self.valid_target)
518+
validate_variant_formatting(self.valid, self.valid_prefixes, self.valid_target, False)
519519

520520
def test_single_target_inconsistent_variants(self):
521521
with self.assertRaises(ValidationError):
522-
validate_variant_formatting(self.inconsistent, self.valid_prefixes, self.valid_target)
522+
validate_variant_formatting(self.inconsistent, self.valid_prefixes, self.valid_target, False)
523523

524524
def test_single_target_invalid_prefixes(self):
525525
with self.assertRaises(ValidationError):
526-
validate_variant_formatting(self.valid, self.invalid_prefixes, self.valid_target)
526+
validate_variant_formatting(self.valid, self.invalid_prefixes, self.valid_target, False)
527527

528528
def test_multi_target_valid_variants(self):
529-
validate_variant_formatting(self.valid_multi, self.valid_prefixes, self.valid_targets)
529+
validate_variant_formatting(self.valid_multi, self.valid_prefixes, self.valid_targets, True)
530530

531531
def test_multi_target_inconsistent_variants(self):
532532
with self.assertRaises(ValidationError):
533-
validate_variant_formatting(self.inconsistent_multi, self.valid_prefixes, self.valid_targets)
533+
validate_variant_formatting(self.inconsistent_multi, self.valid_prefixes, self.valid_targets, True)
534534

535535
def test_multi_target_invalid_prefixes(self):
536536
with self.assertRaises(ValidationError):
537-
validate_variant_formatting(self.valid_multi, self.invalid_prefixes, self.valid_targets)
537+
validate_variant_formatting(self.valid_multi, self.invalid_prefixes, self.valid_targets, True)
538538

539539
def test_multi_target_lacking_full_coords(self):
540540
with self.assertRaises(ValidationError):
541-
validate_variant_formatting(self.valid, self.valid_prefixes, self.valid_targets)
541+
validate_variant_formatting(self.valid, self.valid_prefixes, self.valid_targets, True)
542542

543543
def test_multi_target_invalid_accessions(self):
544544
with self.assertRaises(ValidationError):
545-
validate_variant_formatting(self.invalid_multi, self.valid_prefixes, self.valid_targets)
545+
validate_variant_formatting(self.invalid_multi, self.valid_prefixes, self.valid_targets, True)
546546

547547

548548
class TestGenerateVariantPrefixes(DfTestCase):
@@ -910,27 +910,38 @@ def setUp(self):
910910

911911
self.accession_test_case = AccessionTestCase()
912912

913-
self.valid_hgvs_column = pd.Series(["c.1G>A", "c.2A>T"], name=hgvs_nt_column)
914-
self.missing_data = pd.Series(["c.3T>G", None], name=hgvs_nt_column)
915-
self.duplicate_data = pd.Series(["c.4A>G", "c.4A>G"], name=hgvs_nt_column)
913+
self.valid_hgvs_column = pd.Series(
914+
[f"{VALID_ACCESSION}:c.1G>A", f"{VALID_ACCESSION}:c.2A>T"], name=hgvs_nt_column
915+
)
916+
self.missing_data = pd.Series([f"{VALID_ACCESSION}:c.3T>G", None], name=hgvs_nt_column)
917+
self.duplicate_data = pd.Series([f"{VALID_ACCESSION}:c.4A>G", f"{VALID_ACCESSION}:c.4A>G"], name=hgvs_nt_column)
916918

917919
self.invalid_hgvs_columns_by_name = [
918-
pd.Series(["g.1A>G", "g.1A>T"], name=hgvs_splice_column),
919-
pd.Series(["g.1A>G", "g.1A>T"], name=hgvs_pro_column),
920-
pd.Series(["c.1A>G", "c.1A>T"], name=hgvs_pro_column),
921-
pd.Series(["n.1A>G", "n.1A>T"], name=hgvs_pro_column),
922-
pd.Series(["p.Met1Val", "p.Met1Leu"], name=hgvs_nt_column),
920+
pd.Series([f"{VALID_ACCESSION}:g.1A>G", f"{VALID_ACCESSION}:g.1A>T"], name=hgvs_splice_column),
921+
pd.Series([f"{VALID_ACCESSION}:g.1A>G", f"{VALID_ACCESSION}:g.1A>T"], name=hgvs_pro_column),
922+
pd.Series([f"{VALID_ACCESSION}:c.1A>G", f"{VALID_ACCESSION}:c.1A>T"], name=hgvs_pro_column),
923+
pd.Series([f"{VALID_ACCESSION}:n.1A>G", f"{VALID_ACCESSION}:n.1A>T"], name=hgvs_pro_column),
924+
pd.Series([f"{VALID_ACCESSION}:p.Met1Val", f"{VALID_ACCESSION}:p.Met1Leu"], name=hgvs_nt_column),
923925
]
924926

925927
self.invalid_hgvs_columns_by_contents = [
926-
pd.Series(["r.1a>g", "r.1a>u"], name=hgvs_splice_column), # rna not allowed
927-
pd.Series(["r.1a>g", "r.1a>u"], name=hgvs_nt_column), # rna not allowed
928-
pd.Series(["c.1A>G", "c.5A>T"], name=hgvs_nt_column), # out of bounds for target
929-
pd.Series(["c.1A>G", "_wt"], name=hgvs_nt_column), # old special variant
930-
pd.Series(["p.Met1Leu", "_sy"], name=hgvs_pro_column), # old special variant
931-
pd.Series(["n.1A>G", "c.1A>T"], name=hgvs_nt_column), # mixed prefix
932-
pd.Series(["c.1A>G", "p.Met1Leu"], name=hgvs_pro_column), # mixed types/prefix
933-
pd.Series(["c.1A>G", 2.5], name=hgvs_nt_column), # contains numeric
928+
pd.Series(
929+
[f"{VALID_ACCESSION}:r.1a>g", f"{VALID_ACCESSION}:r.1a>u"], name=hgvs_splice_column
930+
), # rna not allowed
931+
pd.Series(
932+
[f"{VALID_ACCESSION}:r.1a>g", f"{VALID_ACCESSION}:r.1a>u"], name=hgvs_nt_column
933+
), # rna not allowed
934+
pd.Series(
935+
[f"{VALID_ACCESSION}:c.1A>G", f"{VALID_ACCESSION}:c.5A>T"], name=hgvs_nt_column
936+
), # out of bounds for target
937+
pd.Series([f"{VALID_ACCESSION}:c.1A>G", "_wt"], name=hgvs_nt_column), # old special variant
938+
pd.Series([f"{VALID_ACCESSION}:p.Met1Leu", "_sy"], name=hgvs_pro_column), # old special variant
939+
pd.Series([f"{VALID_ACCESSION}:n.1A>G", f"{VALID_ACCESSION}:c.1A>T"], name=hgvs_nt_column), # mixed prefix
940+
pd.Series(
941+
[f"{VALID_ACCESSION}:c.1A>G", f"{VALID_ACCESSION}:p.Met1Leu"], name=hgvs_pro_column
942+
), # mixed types/prefix
943+
pd.Series(["c.1A>G", "p.Met1Leu"], name=hgvs_pro_column), # variants should be fully qualified
944+
pd.Series([f"{VALID_ACCESSION}:c.1A>G", 2.5], name=hgvs_nt_column), # contains numeric
934945
pd.Series([1.0, 2.5], name=hgvs_nt_column), # contains numeric
935946
pd.Series([1.0, 2.5], name=hgvs_splice_column), # contains numeric
936947
pd.Series([1.0, 2.5], name=hgvs_pro_column), # contains numeric

tests/worker/data/counts_acc.csv

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
hgvs_nt,c_0,c_1
2-
c.1G>C,10,20
3-
c.2A>G,8,8
4-
c.6C>A,90,2
2+
NM_001637.3:c.1G>C,10,20
3+
NM_001637.3:c.2A>G,8,8
4+
NM_001637.3:c.6C>A,90,2

tests/worker/data/scores_acc.csv

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
hgvs_nt,score
2-
c.1G>C,0.3
3-
c.2A>G,0.0
4-
c.6C>A,-1.65
2+
NM_001637.3:c.1G>C,0.3
3+
NM_001637.3:c.2A>G,0.0
4+
NM_001637.3:c.6C>A,-1.65

0 commit comments

Comments
 (0)