-
Notifications
You must be signed in to change notification settings - Fork 3
Analysis performance: benchmarking suite for large variant queries #1494
Description
🤖 Written by Claude.
Background
From performance analysis of CohortGenotype / Analysis node queries, several optimisation candidates were identified. Before implementing any of them, we need a reliable way to measure the actual impact.
One candidate — PopulationNode.keep_internally_classified_pathogenic materialising classified variant IDs as a Python list (pk__in=[id, id, ...]) rather than a subquery — has already been benchmarked and the list is faster in practice, despite being less elegant. This issue tracks building a proper benchmarking harness so future candidates can be evaluated the same way.
See claude/analysis_performance_gains.md for the full list of candidates.
Proposed: benchmark_analysis_queries management command
A Django management command that:
- Finds realistic data automatically (large samples, cohorts, trios from the live DB)
- Runs each candidate query pattern multiple times and reports median wall time + `EXPLAIN ANALYZE` output
- Accepts a `--node-type` flag to target specific node types
Sketch
# analysis/management/commands/benchmark_analysis_queries.py
from django.core.management.base import BaseCommand
from django.db import connection, reset_queries
import time, statistics
class Command(BaseCommand):
help = "Benchmark analysis node query patterns against real DB data"
def add_arguments(self, parser):
parser.add_argument('--repeats', type=int, default=5)
parser.add_argument('--node-type', choices=['sample', 'cohort', 'trio', 'gene_list', 'population', 'all'],
default='all')
parser.add_argument('--explain', action='store_true', help='Print EXPLAIN ANALYZE for each query')
def handle(self, *args, **options):
repeats = options['repeats']
benchmarks = self._collect_benchmarks(options['node_type'])
for name, get_qs in benchmarks:
self._run(name, get_qs, repeats, options['explain'])
def _run(self, name, get_qs, repeats, explain):
times = []
for _ in range(repeats):
qs = get_qs()
t0 = time.perf_counter()
count = qs.count()
times.append(time.perf_counter() - t0)
median = statistics.median(times)
self.stdout.write(f"{name}: count={count} median={median*1000:.1f}ms over {repeats} runs")
if explain:
qs = get_qs()
with connection.cursor() as cursor:
sql, params = qs.query.sql_with_params()
cursor.execute(f"EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT) {sql}", params)
for row in cursor.fetchall():
self.stdout.write(row[0])
def _collect_benchmarks(self, node_type):
benchmarks = []
if node_type in ('sample', 'all'):
benchmarks += self._sample_benchmarks()
if node_type in ('cohort', 'all'):
benchmarks += self._cohort_benchmarks()
if node_type in ('trio', 'all'):
benchmarks += self._trio_benchmarks()
if node_type in ('gene_list', 'all'):
benchmarks += self._gene_list_benchmarks()
if node_type in ('population', 'all'):
benchmarks += self._population_benchmarks()
return benchmarksFinding real data
The command should pick representative large objects automatically, e.g.:
def _get_largest_sample(self):
from snpdb.models import Sample, SampleStats
# Pick the sample with the most variants
ss = SampleStats.objects.order_by('-variant_count').select_related('sample').first()
return ss.sample if ss else Sample.objects.order_by('-pk').first()
def _get_largest_cohort(self):
from snpdb.models import Cohort, CohortGenotypeCollection
# Cohort with the biggest CGC partition (most rows)
cgc = (CohortGenotypeCollection.objects
.filter(cohort__genome_build__isnull=False)
.order_by('-cohort__sample_count', '-pk')
.select_related('cohort').first())
return cgc.cohort if cgc else None
def _get_trio(self):
from snpdb.models import Trio
return Trio.objects.order_by('-pk').first()Sample node benchmark (example)
def _sample_benchmarks(self):
from snpdb.models import Variant
sample = self._get_largest_sample()
if not sample:
return []
cgc = sample.vcf.cohort.cohort_genotype_collection
annotation_kwargs = cgc.get_annotation_kwargs()
alias = cgc.cohortgenotype_alias
def qs_het_only():
from patients.models_enums import Zygosity
from django.db.models import Q
qs = Variant.objects.annotate(**annotation_kwargs)
qs = qs.filter(Q(**{f"{alias}__samples_zygosity__regex": r'^H'}))
return qs
def qs_het_via_substr():
from django.db.models import Q
ann = sample.get_annotation_kwargs()
zyg_alias = sample.zygosity_alias
qs = Variant.objects.annotate(**annotation_kwargs).annotate(**ann)
qs = qs.filter(Q(**{f"{zyg_alias}__in": ['H']}))
return qs
return [
(f"sample({sample.pk}) het via regex", qs_het_only),
(f"sample({sample.pk}) het via substr+in", qs_het_via_substr),
]Population node benchmark (the list-vs-subquery case)
def _population_benchmarks(self):
# This was already benchmarked - list wins. Keep for regression testing.
from annotation.models import VariantAnnotation
from snpdb.models import Variant
from django.db.models import Q
def qs_af_filter_list(variant_ids):
return Variant.objects.filter(
pk__in=variant_ids,
variantannotation__gnomad_af__lte=0.01,
)
def qs_af_filter_subquery(parent_qs):
return parent_qs.filter(variantannotation__gnomad_af__lte=0.01)
# ... build parent_qs and variant_ids from largest sample
sample = self._get_largest_sample()
if not sample:
return []
parent_qs = sample.get_variant_qs()
variant_ids = list(parent_qs.values_list('pk', flat=True)[:50_000])
return [
("population AF filter via list (current)", lambda: qs_af_filter_list(variant_ids)),
("population AF filter via subquery", lambda: qs_af_filter_subquery(parent_qs)),
]Gene list benchmark (VariantGeneOverlap index)
def _gene_list_benchmarks(self):
from annotation.models import VariantAnnotationVersion, VariantGeneOverlap
from snpdb.models import Variant
from django.db.models import Q
vav = VariantAnnotationVersion.objects.order_by('-pk').first()
if not vav:
return []
# Pick a reasonably-sized gene list (e.g. ACMG59 genes)
from genes.models import GeneList
gene_list = GeneList.objects.order_by('-genelistgenesymbol_count').first()
if not gene_list:
return []
gene_ids = list(gene_list.get_gene_ids())
def qs_via_vgo():
vgo = VariantGeneOverlap.objects.filter(version=vav, gene__in=gene_ids)
return Variant.objects.filter(pk__in=vgo.values_list('variant_id', flat=True))
def qs_via_transcript_annotation():
from annotation.models import VariantTranscriptAnnotation
vta = VariantTranscriptAnnotation.objects.filter(version=vav, gene__in=gene_ids)
return Variant.objects.filter(pk__in=vta.values_list('variant_id', flat=True)).distinct()
return [
("gene list via VariantGeneOverlap (current)", qs_via_vgo),
("gene list via VariantTranscriptAnnotation+distinct (old)", qs_via_transcript_annotation),
]What makes this useful
- Runs against real data — avoids the trap where synthetic benchmarks don't reflect actual data distribution (e.g. how many HET calls are in the zygosity string, how many variants are in a gene list).
- Repeats with median — avoids one-off cache effects.
- EXPLAIN ANALYZE — makes it easy to spot seq scans vs index scans and spot when the planner is making bad choices.
- Pairs of queries — each benchmark should test current vs proposed so the trade-off is visible.
- Regression guard — once a fast path is chosen, future changes that regress it will be visible immediately.