Skip to content

Commit 7234594

Browse files
authored
gene cnv optimisation (#655)
1 parent 33b8535 commit 7234594

File tree

4 files changed

+62
-21
lines changed

4 files changed

+62
-21
lines changed

malariagen_data/anopheles.py

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -886,14 +886,24 @@ def _gene_cnv(
886886
chunks,
887887
inline_array,
888888
):
889-
debug = self._log.debug
890-
891-
debug("sanity check")
889+
# Sanity check.
892890
assert isinstance(region, Region)
893891

894-
debug("access HMM data")
892+
# Access genes within the region of interest.
893+
df_genome_features = self.genome_features(region=region)
894+
sample_query_options = sample_query_options or {}
895+
df_genes = df_genome_features.query(
896+
f"type == '{self._gff_gene_type}'", **sample_query_options
897+
)
898+
899+
# Refine the region for CNV data to ensure coverage of all requested genes.
900+
cnv_region = Region(
901+
region.contig, df_genes["start"].min(), df_genes["end"].max()
902+
)
903+
904+
# Access HMM data.
895905
ds_hmm = self.cnv_hmm(
896-
region=region.contig,
906+
region=cnv_region,
897907
sample_sets=sample_sets,
898908
sample_query=sample_query,
899909
sample_query_options=sample_query_options,
@@ -907,45 +917,38 @@ def _gene_cnv(
907917
with self._dask_progress(desc="Load CNV HMM data"):
908918
pos, end, cn = dask.compute(pos, end, cn)
909919

910-
debug("access genes")
911-
df_genome_features = self.genome_features(region=region)
912-
sample_query_options = sample_query_options or {}
913-
df_genes = df_genome_features.query(
914-
f"type == '{self._gff_gene_type}'", **sample_query_options
915-
)
916-
917-
debug("setup intermediates")
920+
# Set up intermediates.
918921
windows = []
919922
modes = []
920923
counts = []
921924

922-
debug("iterate over genes")
925+
# Iterate over genes.
923926
genes_iterator = self._progress(
924927
df_genes.itertuples(),
925928
desc="Compute modal gene copy number",
926929
total=len(df_genes),
927930
)
928931
for gene in genes_iterator:
929-
# locate windows overlapping the gene
932+
# Locate windows overlapping the gene.
930933
loc_gene_start = bisect_left(end, gene.start)
931934
loc_gene_stop = bisect_right(pos, gene.end)
932935
w = loc_gene_stop - loc_gene_start
933936
windows.append(w)
934937

935-
# slice out copy number data for the given gene
938+
# Slice out copy number data for the given gene.
936939
cn_gene = cn[loc_gene_start:loc_gene_stop]
937940

938-
# compute the modes
941+
# Compute the modes.
939942
m, c = _cn_mode(cn_gene, vmax=12)
940943
modes.append(m)
941944
counts.append(c)
942945

943-
debug("combine results")
946+
# Combine results.
944947
windows = np.array(windows)
945948
modes = np.vstack(modes)
946949
counts = np.vstack(counts)
947950

948-
debug("build dataset")
951+
# Build dataset.
949952
ds_out = xr.Dataset(
950953
coords={
951954
"gene_id": (["genes"], df_genes["ID"].values),

notebooks/plot_frequencies_heatmap.ipynb

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,44 @@
381381
"id": "86c5c594",
382382
"metadata": {},
383383
"outputs": [],
384+
"source": [
385+
"interesting_cyp_genes = [\n",
386+
" \"AGAP002862\", # Cyp6aa1\n",
387+
" \"AGAP013128\", # Cyp6aa2\n",
388+
" \"AGAP002865\", # Cyp6p3\n",
389+
" \"AGAP000818\", # Cyp9k1\n",
390+
" \"AGAP008212\", # Cyp6m2\n",
391+
" \"AGAP008218\", # Cyp6z2 \n",
392+
"]\n",
393+
"\n",
394+
"cyp_cnv_freqs_df = ag3.gene_cnv_frequencies(\n",
395+
" region=interesting_cyp_genes,\n",
396+
" cohorts=\"admin1_year\",\n",
397+
" sample_sets=(\"AG1000G-BF-A\", \"AG1000G-BF-B\", \"AG1000G-BF-C\"),\n",
398+
" sample_query=\"taxon == 'coluzzii'\",\n",
399+
")"
400+
]
401+
},
402+
{
403+
"cell_type": "code",
404+
"execution_count": null,
405+
"id": "6d7ad130-30c2-4cd3-8906-a7ada3ccc75f",
406+
"metadata": {},
407+
"outputs": [],
408+
"source": [
409+
"ag3.plot_frequencies_heatmap(\n",
410+
" df=cyp_cnv_freqs_df,\n",
411+
" color_continuous_scale=\"Blues\",\n",
412+
" title=\"Cyp gene CNV frequencies\",\n",
413+
")"
414+
]
415+
},
416+
{
417+
"cell_type": "code",
418+
"execution_count": null,
419+
"id": "83aab417-632e-4fd2-8da4-3ffdd6e233f6",
420+
"metadata": {},
421+
"outputs": [],
384422
"source": []
385423
}
386424
],

tests/test_af1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from malariagen_data.util import locate_region, resolve_region
77

88

9-
def setup_af1(url="simplecache::gs://vo_afun_release/", **kwargs):
9+
def setup_af1(url="simplecache::gs://vo_afun_release_master_us_central1/", **kwargs):
1010
kwargs.setdefault("check_location", False)
1111
kwargs.setdefault("show_progress", False)
1212
if url is None:

tests/test_ag3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
contigs = "2R", "2L", "3R", "3L", "X"
1616

1717

18-
def setup_ag3(url="simplecache::gs://vo_agam_release/", **kwargs):
18+
def setup_ag3(url="simplecache::gs://vo_agam_release_master_us_central1/", **kwargs):
1919
kwargs.setdefault("check_location", False)
2020
kwargs.setdefault("show_progress", False)
2121
if url is None:

0 commit comments

Comments
 (0)