Skip to content

Analysis performance: benchmarking suite for large variant queries #1494

@davmlaw

Description

@davmlaw

🤖 Written by Claude.

Background

From performance analysis of CohortGenotype / Analysis node queries, several optimisation candidates were identified. Before implementing any of them, we need a reliable way to measure the actual impact.

One candidate — PopulationNode.keep_internally_classified_pathogenic materialising classified variant IDs as a Python list (pk__in=[id, id, ...]) rather than a subquery — has already been benchmarked, and the list is faster in practice despite being less elegant. This issue tracks building a proper benchmarking harness so future candidates can be evaluated the same way.

See claude/analysis_performance_gains.md for the full list of candidates.


Proposed: benchmark_analysis_queries management command

A Django management command that:

  1. Finds realistic data automatically (large samples, cohorts, trios from the live DB)
  2. Runs each candidate query pattern multiple times and reports median wall time + EXPLAIN ANALYZE output
  3. Accepts a --node-type flag to target specific node types

Sketch

# analysis/management/commands/benchmark_analysis_queries.py

from django.core.management.base import BaseCommand
from django.db import connection, reset_queries
import time, statistics

class Command(BaseCommand):
    """Benchmark analysis node query patterns against real DB data.

    For each selected node type, collects (name, queryset-factory) pairs,
    times ``qs.count()`` over several repeats, and reports the median wall
    time.  With ``--explain``, also prints EXPLAIN ANALYZE output for each
    query so planner choices (seq scan vs index scan) are visible.
    """
    help = "Benchmark analysis node query patterns against real DB data"

    def add_arguments(self, parser):
        parser.add_argument('--repeats', type=int, default=5)
        parser.add_argument('--node-type', choices=['sample', 'cohort', 'trio', 'gene_list', 'population', 'all'],
                            default='all')
        parser.add_argument('--explain', action='store_true', help='Print EXPLAIN ANALYZE for each query')

    def handle(self, *args, **options):
        from django.core.management.base import CommandError
        repeats = options['repeats']
        # Guard: with repeats < 1 the timing loop in _run never executes,
        # leaving `count` unbound when the report line is formatted.
        if repeats < 1:
            raise CommandError("--repeats must be >= 1")
        benchmarks = self._collect_benchmarks(options['node_type'])
        for name, get_qs in benchmarks:
            self._run(name, get_qs, repeats, options['explain'])

    def _run(self, name, get_qs, repeats, explain):
        """Time ``get_qs().count()`` `repeats` times and report the median in ms.

        The queryset is rebuilt on every iteration so Django's per-queryset
        result cache cannot short-circuit later runs.  Requires repeats >= 1.
        """
        times = []
        count = None
        for _ in range(repeats):
            qs = get_qs()
            t0 = time.perf_counter()
            count = qs.count()
            times.append(time.perf_counter() - t0)
        median = statistics.median(times)
        self.stdout.write(f"{name}: count={count} median={median*1000:.1f}ms over {repeats} runs")
        if explain:
            qs = get_qs()
            with connection.cursor() as cursor:
                sql, params = qs.query.sql_with_params()
                # Only the EXPLAIN prefix is interpolated; query parameters
                # are passed separately, so no injection risk here.
                cursor.execute(f"EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT) {sql}", params)
                for row in cursor.fetchall():
                    self.stdout.write(row[0])

    def _collect_benchmarks(self, node_type):
        """Return benchmark pairs for the requested node type ('all' = every type)."""
        collectors = [
            ('sample', self._sample_benchmarks),
            ('cohort', self._cohort_benchmarks),
            ('trio', self._trio_benchmarks),
            ('gene_list', self._gene_list_benchmarks),
            ('population', self._population_benchmarks),
        ]
        benchmarks = []
        for key, collect in collectors:
            if node_type in (key, 'all'):
                benchmarks += collect()
        return benchmarks

Finding real data

The command should pick representative large objects automatically, e.g.:

def _get_largest_sample(self):
    """Return the sample with the highest recorded variant count.

    Falls back to the newest sample by pk when no SampleStats exist;
    returns None only if there are no samples at all.
    """
    from snpdb.models import Sample, SampleStats
    top_stats = (SampleStats.objects
                 .select_related('sample')
                 .order_by('-variant_count')
                 .first())
    if top_stats is not None:
        return top_stats.sample
    return Sample.objects.order_by('-pk').first()

def _get_largest_cohort(self):
    """Return the cohort backing the largest CohortGenotypeCollection, or None.

    "Largest" is approximated by the cohort's sample_count (ties broken by
    newest pk); only cohorts with a genome build are considered.
    """
    from snpdb.models import Cohort, CohortGenotypeCollection
    candidates = CohortGenotypeCollection.objects.filter(cohort__genome_build__isnull=False)
    candidates = candidates.select_related('cohort')
    biggest = candidates.order_by('-cohort__sample_count', '-pk').first()
    if biggest is None:
        return None
    return biggest.cohort

def _get_trio(self):
    """Return the most recently created Trio, or None if none exist."""
    from snpdb.models import Trio
    latest_trio = Trio.objects.order_by('-pk').first()
    return latest_trio

Sample node benchmark (example)

def _sample_benchmarks(self):
    """Benchmarks comparing two ways of selecting HET calls for the largest sample.

    Returns a list of (label, queryset-factory) pairs; empty list when no
    sample can be found.
    """
    from snpdb.models import Variant
    sample = self._get_largest_sample()
    if not sample:
        return []
    cgc = sample.vcf.cohort.cohort_genotype_collection
    annotation_kwargs = cgc.get_annotation_kwargs()
    alias = cgc.cohortgenotype_alias

    def qs_het_only():
        # Regex anchored at the start of the packed zygosity string.
        # NOTE(review): '^H' assumes this sample occupies the first position
        # in the cohort's samples_zygosity string — confirm sample ordering.
        # (Removed an unused `Zygosity` import that was here.)
        from django.db.models import Q
        qs = Variant.objects.annotate(**annotation_kwargs)
        qs = qs.filter(Q(**{f"{alias}__samples_zygosity__regex": r'^H'}))
        return qs

    def qs_het_via_substr():
        # Alternative: per-sample zygosity alias + IN lookup.
        from django.db.models import Q
        ann = sample.get_annotation_kwargs()
        zyg_alias = sample.zygosity_alias
        qs = Variant.objects.annotate(**annotation_kwargs).annotate(**ann)
        qs = qs.filter(Q(**{f"{zyg_alias}__in": ['H']}))
        return qs

    return [
        (f"sample({sample.pk}) het via regex", qs_het_only),
        (f"sample({sample.pk}) het via substr+in", qs_het_via_substr),
    ]

Population node benchmark (the list-vs-subquery case)

def _population_benchmarks(self, id_limit=50_000):
    """Benchmarks for the population-node AF filter: id-list vs subquery.

    This pair was already benchmarked — the materialised id list won.
    Kept here as a regression guard.  `id_limit` caps how many variant ids
    are materialised for the list variant (default matches the original
    benchmark).  Unused imports (VariantAnnotation, Q) removed.
    """
    from snpdb.models import Variant

    def qs_af_filter_list(variant_ids):
        # Current approach: materialised pk list passed as pk__in.
        return Variant.objects.filter(
            pk__in=variant_ids,
            variantannotation__gnomad_af__lte=0.01,
        )

    def qs_af_filter_subquery(parent_qs):
        # Proposed alternative: filter layered on the parent queryset.
        return parent_qs.filter(variantannotation__gnomad_af__lte=0.01)

    # Build parent_qs and variant_ids from the largest sample.
    sample = self._get_largest_sample()
    if not sample:
        return []
    parent_qs = sample.get_variant_qs()
    variant_ids = list(parent_qs.values_list('pk', flat=True)[:id_limit])

    return [
        ("population AF filter via list (current)", lambda: qs_af_filter_list(variant_ids)),
        ("population AF filter via subquery", lambda: qs_af_filter_subquery(parent_qs)),
    ]

Gene list benchmark (VariantGeneOverlap index)

def _gene_list_benchmarks(self):
    """Benchmarks comparing gene-list variant lookup strategies.

    Compares the current VariantGeneOverlap join against the older
    VariantTranscriptAnnotation + distinct() approach.  Returns an empty
    list if no annotation version or gene list can be found.
    (Removed an unused `Q` import.)
    """
    from annotation.models import VariantAnnotationVersion, VariantGeneOverlap
    from snpdb.models import Variant

    vav = VariantAnnotationVersion.objects.order_by('-pk').first()
    if not vav:
        return []

    # Pick the biggest gene list available (e.g. ACMG59 genes).
    # NOTE(review): ordering by 'genelistgenesymbol_count' assumes that
    # field/annotation exists on GeneList — confirm against the model.
    from genes.models import GeneList
    gene_list = GeneList.objects.order_by('-genelistgenesymbol_count').first()
    if not gene_list:
        return []

    gene_ids = list(gene_list.get_gene_ids())

    def qs_via_vgo():
        vgo = VariantGeneOverlap.objects.filter(version=vav, gene__in=gene_ids)
        return Variant.objects.filter(pk__in=vgo.values_list('variant_id', flat=True))

    def qs_via_transcript_annotation():
        from annotation.models import VariantTranscriptAnnotation
        vta = VariantTranscriptAnnotation.objects.filter(version=vav, gene__in=gene_ids)
        return Variant.objects.filter(pk__in=vta.values_list('variant_id', flat=True)).distinct()

    return [
        ("gene list via VariantGeneOverlap (current)", qs_via_vgo),
        ("gene list via VariantTranscriptAnnotation+distinct (old)", qs_via_transcript_annotation),
    ]

What makes this useful

  • Runs against real data — avoids the trap where synthetic benchmarks don't reflect actual data distribution (e.g. how many HET calls are in the zygosity string, how many variants are in a gene list).
  • Repeats with median — avoids one-off cache effects.
  • EXPLAIN ANALYZE — makes it easy to spot seq scans vs index scans and to notice when the planner is making bad choices.
  • Pairs of queries — each benchmark should test current vs proposed so the trade-off is visible.
  • Regression guard — once a fast path is chosen, future changes that regress it will be visible immediately.

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions