Skip to content

Commit 9876327

Browse files
authored
Merge pull request #573 from VariantEffect/bugfix/bencap/511/statistics-variant-effect-measurement-double-counting
fixed: variant count endpoint was returning a count including non-distinct variants
2 parents b155ed6 + cb5717c commit 9876327

File tree

1 file changed

+34
-14
lines changed

1 file changed

+34
-14
lines changed

src/mavedb/routers/statistics.py

Lines changed: 34 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -515,18 +515,24 @@ def variant_counts(group: Optional[GroupBy] = None, db: Session = Depends(get_db
515515
Returns a dictionary of counts for the number of published and distinct variants in the database.
516516
Optionally, group the counts by the month or year in which the score set (and by extension, the variant) was published.
517517
"""
518-
variants = db.execute(
519-
select(PublishedVariantsMV.published_date, func.count(PublishedVariantsMV.variant_id))
518+
# Fast path: total distinct variants without per-date aggregation.
519+
if group is None:
520+
total = db.execute(select(func.count(func.distinct(PublishedVariantsMV.variant_id)))).scalar_one() # type: ignore
521+
return OrderedDict([("count", total)])
522+
523+
# Grouped path: materialize distinct counts per published_date, then roll up.
524+
per_date = db.execute(
525+
select(PublishedVariantsMV.published_date, func.count(func.distinct(PublishedVariantsMV.variant_id)))
520526
.group_by(PublishedVariantsMV.published_date)
521527
.order_by(PublishedVariantsMV.published_date)
522528
).all()
523529

524530
if group == GroupBy.month:
525-
grouped = {k: sum(c for _, c in g) for k, g in itertools.groupby(variants, lambda t: t[0].strftime("%Y-%m"))}
531+
grouped = {k: sum(c for _, c in g) for k, g in itertools.groupby(per_date, lambda t: t[0].strftime("%Y-%m"))}
526532
elif group == GroupBy.year:
527-
grouped = {k: sum(c for _, c in g) for k, g in itertools.groupby(variants, lambda t: t[0].strftime("%Y"))}
528-
else:
529-
grouped = {"count": sum(count for _, count in variants)}
533+
grouped = {k: sum(c for _, c in g) for k, g in itertools.groupby(per_date, lambda t: t[0].strftime("%Y"))}
534+
else: # Defensive fallback.
535+
grouped = {"count": sum(c for _, c in per_date)}
530536

531537
return OrderedDict(sorted(grouped.items()))
532538

@@ -540,20 +546,34 @@ def mapped_variant_counts(
540546
Optionally, group the counts by the month or year in which the score set (and by extension, the variant) was published.
541547
Optionally, return the count of all mapped variants, not just the current/most up to date ones.
542548
"""
543-
query = select(PublishedVariantsMV.published_date, func.count(PublishedVariantsMV.mapped_variant_id))
549+
# Fast path: total distinct mapped variants (optionally only current) without per-date aggregation.
550+
if group is None:
551+
total_stmt = select(func.count(func.distinct(PublishedVariantsMV.mapped_variant_id)))
552+
553+
if onlyCurrent:
554+
total_stmt = total_stmt.where(PublishedVariantsMV.current_mapped_variant.is_(True))
555+
556+
total = db.execute(total_stmt).scalar_one() # type: ignore
557+
return OrderedDict([("count", total)])
558+
559+
# Grouped path: materialize distinct counts per published_date, then roll up.
560+
per_date_stmt = select(
561+
PublishedVariantsMV.published_date,
562+
func.count(func.distinct(PublishedVariantsMV.mapped_variant_id)),
563+
)
544564

545565
if onlyCurrent:
546-
query = query.where(PublishedVariantsMV.current_mapped_variant.is_(True))
566+
per_date_stmt = per_date_stmt.where(PublishedVariantsMV.current_mapped_variant.is_(True))
547567

548-
variants = db.execute(
549-
query.group_by(PublishedVariantsMV.published_date).order_by(PublishedVariantsMV.published_date)
568+
per_date = db.execute(
569+
per_date_stmt.group_by(PublishedVariantsMV.published_date).order_by(PublishedVariantsMV.published_date)
550570
).all()
551571

552572
if group == GroupBy.month:
553-
grouped = {k: sum(c for _, c in g) for k, g in itertools.groupby(variants, lambda t: t[0].strftime("%Y-%m"))}
573+
grouped = {k: sum(c for _, c in g) for k, g in itertools.groupby(per_date, lambda t: t[0].strftime("%Y-%m"))}
554574
elif group == GroupBy.year:
555-
grouped = {k: sum(c for _, c in g) for k, g in itertools.groupby(variants, lambda t: t[0].strftime("%Y"))}
556-
else:
557-
grouped = {"count": sum(count for _, count in variants)}
575+
grouped = {k: sum(c for _, c in g) for k, g in itertools.groupby(per_date, lambda t: t[0].strftime("%Y"))}
576+
else: # Defensive fallback.
577+
grouped = {"count": sum(c for _, c in per_date)}
558578

559579
return OrderedDict(sorted(grouped.items()))

0 commit comments

Comments
 (0)