Skip to content

Commit dee025c

Browse files
authored
Merge pull request #587 from VariantEffect/feature/bencap/586/script-for-calibration-variant-effect-figures
feat: add script to count unique variant effect measurements within ACMG-classified ranges
2 parents 0a8b9e6 + 15424fe commit dee025c

File tree

1 file changed

+185
-0
lines changed

1 file changed

+185
-0
lines changed
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
"""
2+
Count unique variant effect measurements within ACMG-classified functional ranges.
3+
4+
This script analyzes MaveDB score sets to count how many variant effect measurements
5+
have functional scores that fall within score calibration ranges associated with
6+
ACMG (American College of Medical Genetics) classifications. The analysis provides
7+
insights into how many variants can be clinically interpreted using established
8+
evidence strength frameworks.
9+
10+
Usage:
11+
# Show help and available options
12+
with_mavedb_local poetry run python3 -m mavedb.scripts.effect_measurements --help
13+
14+
# Run in dry-run mode (default, no database changes, shows results)
15+
with_mavedb_local poetry run python3 -m mavedb.scripts.effect_measurements --dry-run
16+
17+
# Run and commit results (this script is read-only, so commit doesn't change anything)
18+
with_mavedb_local poetry run python3 -m mavedb.scripts.effect_measurements --commit
19+
20+
Behavior:
21+
1. Queries all non-superseded score sets that have score calibrations
22+
2. Identifies calibrations with functional ranges that have ACMG classifications
23+
3. For each qualifying score set, queries its variants with non-null scores
24+
4. Counts variants whose scores fall within ACMG-classified ranges
25+
5. Reports statistics on classification coverage
26+
27+
Key Filters:
28+
- Excludes superseded score sets (where superseding_score_set is not None)
29+
- Only processes score sets that have at least one score calibration
30+
- Only considers functional ranges with ACMG classification data
31+
- Only counts variants that have non-null functional scores
32+
- Each variant is counted only once per score set, even if it matches multiple ranges
33+
34+
ACMG Classification Detection:
35+
A functional range is considered to have an ACMG classification if its
36+
acmg_classification field contains any of:
37+
- criterion (PS3, BS3, etc.)
38+
- evidence_strength (Supporting, Moderate, Strong, Very Strong)
39+
- points (numeric evidence points)
40+
41+
Performance Notes:
42+
- Uses optimized queries to avoid loading unnecessary data
43+
- Loads score sets and calibrations first, then queries variants separately
44+
- Filters variants at the database level for better performance
45+
- Memory usage scales with the number of score sets with ACMG ranges
46+
47+
Output:
48+
- Progress updates for each score set with classified variants
49+
- Summary statistics including:
50+
* Number of score sets with ACMG classifications
51+
* Total unique variants processed
52+
* Number of variants within ACMG-classified ranges
53+
* Overall classification rate percentage
54+
55+
Caveats:
56+
- This is a read-only analysis script (makes no database changes)
57+
- Variants with null/missing scores are included in the analysis
58+
"""
59+
60+
import logging
61+
from typing import Set
62+
63+
import click
64+
from sqlalchemy import select
65+
from sqlalchemy.orm import Session, joinedload
66+
67+
from mavedb.models.score_set import ScoreSet
68+
from mavedb.scripts.environment import with_database_session
69+
from mavedb.view_models.score_calibration import FunctionalRange
70+
71+
logger = logging.getLogger(__name__)
72+
73+
74+
def score_falls_within_range(score: float, functional_range: dict) -> bool:
75+
"""Check if a score falls within a functional range using the view model."""
76+
try:
77+
range_obj = FunctionalRange.model_validate(functional_range)
78+
return range_obj.is_contained_by_range(score)
79+
except Exception as e:
80+
logger.warning(f"Error validating functional range: {e}")
81+
return False
82+
83+
84+
def has_acmg_classification(functional_range: dict) -> bool:
85+
"""Check if a functional range has an ACMG classification."""
86+
acmg_data = functional_range.get("acmg_classification")
87+
return acmg_data is not None and (
88+
acmg_data.get("criterion") is not None
89+
or acmg_data.get("evidence_strength") is not None
90+
or acmg_data.get("points") is not None
91+
)
92+
93+
94+
@click.command()
95+
@with_database_session
96+
def main(db: Session) -> None:
97+
"""Count unique variant effect measurements with ACMG-classified functional ranges."""
98+
99+
query = (
100+
select(ScoreSet)
101+
.options(joinedload(ScoreSet.score_calibrations))
102+
.where(ScoreSet.private.is_(False)) # Public score sets only
103+
.where(ScoreSet.superseded_score_set_id.is_(None)) # Not superseded
104+
.where(ScoreSet.score_calibrations.any()) # Has calibrations
105+
)
106+
107+
score_sets = db.scalars(query).unique().all()
108+
109+
total_variants = 0
110+
classified_variants = 0
111+
score_sets_with_acmg = 0
112+
processed_variants: Set[int] = set()
113+
gene_list: Set[str] = set()
114+
115+
click.echo(f"Found {len(score_sets)} non-superseded score sets with calibrations")
116+
117+
for score_set in score_sets:
118+
# Collect all ACMG-classified ranges from this score set's calibrations
119+
acmg_ranges = []
120+
for calibration in score_set.score_calibrations:
121+
if calibration.functional_ranges:
122+
for func_range in calibration.functional_ranges:
123+
if has_acmg_classification(func_range):
124+
acmg_ranges.append(func_range)
125+
126+
if not acmg_ranges:
127+
continue
128+
129+
score_sets_with_acmg += 1
130+
score_set_classified_variants = 0
131+
132+
# Retain a list of unique target genes for reporting
133+
for target in score_set.target_genes:
134+
target_name = target.name
135+
if not target_name:
136+
continue
137+
138+
gene_list.add(target_name.strip().upper())
139+
140+
for variant in score_set.variants:
141+
if variant.id in processed_variants:
142+
continue
143+
144+
variant_data = variant.data
145+
if not variant_data:
146+
continue
147+
148+
score_data = variant_data.get("score_data", {})
149+
score = score_data.get("score")
150+
151+
total_variants += 1
152+
processed_variants.add(variant.id) # type: ignore
153+
154+
if score is None:
155+
continue
156+
157+
# Check if score falls within any ACMG-classified range in this score set
158+
for func_range in acmg_ranges:
159+
if score_falls_within_range(float(score), func_range):
160+
classified_variants += 1
161+
score_set_classified_variants += 1
162+
break # Count variant only once per score set
163+
164+
if score_set_classified_variants > 0:
165+
click.echo(
166+
f"Score set {score_set.urn}: {score_set_classified_variants} classified variants ({score_set.num_variants} total variants)"
167+
)
168+
169+
click.echo("\n" + "=" * 60)
170+
click.echo("SUMMARY")
171+
click.echo("=" * 60)
172+
click.echo(f"Score sets with ACMG classifications: {score_sets_with_acmg}")
173+
click.echo(f"Total unique variants processed: {total_variants}")
174+
click.echo(f"Variants within ACMG-classified ranges: {classified_variants}")
175+
click.echo(f"Unique target genes covered ({len(gene_list)}):")
176+
for gene in sorted(gene_list):
177+
click.echo(f" - {gene}")
178+
179+
if total_variants > 0:
180+
percentage = (classified_variants / total_variants) * 100
181+
click.echo(f"Classification rate: {percentage:.1f}%")
182+
183+
184+
if __name__ == "__main__": # pragma: no cover
185+
main()

0 commit comments

Comments
 (0)