Skip to content

Commit e07f92c

Browse files
authored
Merge branch 'master' into plink-converter-2024-03-26-tristanpwdennis-shadow
2 parents b7632cc + 21988d4 commit e07f92c

File tree

5 files changed

+72
-22
lines changed

5 files changed

+72
-22
lines changed

malariagen_data/anoph/base.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,14 +327,18 @@ def _discover_releases(self) -> Tuple[str, ...]:
327327
)
328328
# Note: this matches v3, v3. and v3.1, but not v3001.1
329329
version_pattern = re.compile(f"^v{self._major_version_number}(\\..*)?$")
330+
# To sort the versions numerically, we use a lambda function for the "key" parameter of sorted().
331+
# The lambda function splits each version string into a list of its integer parts, using split('.') and int(), e.g. [3, 1],
332+
# which sorted() then uses to determine the order, as opposed to the default lexicographic order.
330333
discovered_releases = tuple(
331334
sorted(
332335
[
333336
self._path_to_release(d)
334337
for d in sub_dirs
335338
if version_pattern.match(d)
336339
and self._fs.exists(f"{self._base_path}/{d}/manifest.tsv")
337-
]
340+
],
341+
key=lambda v: [int(part) for part in v.split(".")],
338342
)
339343
)
340344
return discovered_releases

malariagen_data/anopheles.py

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -888,14 +888,24 @@ def _gene_cnv(
888888
chunks,
889889
inline_array,
890890
):
891-
debug = self._log.debug
892-
893-
debug("sanity check")
891+
# Sanity check.
894892
assert isinstance(region, Region)
895893

896-
debug("access HMM data")
894+
# Access genes within the region of interest.
895+
df_genome_features = self.genome_features(region=region)
896+
sample_query_options = sample_query_options or {}
897+
df_genes = df_genome_features.query(
898+
f"type == '{self._gff_gene_type}'", **sample_query_options
899+
)
900+
901+
# Refine the region for CNV data to ensure coverage of all requested genes.
902+
cnv_region = Region(
903+
region.contig, df_genes["start"].min(), df_genes["end"].max()
904+
)
905+
906+
# Access HMM data.
897907
ds_hmm = self.cnv_hmm(
898-
region=region.contig,
908+
region=cnv_region,
899909
sample_sets=sample_sets,
900910
sample_query=sample_query,
901911
sample_query_options=sample_query_options,
@@ -909,45 +919,38 @@ def _gene_cnv(
909919
with self._dask_progress(desc="Load CNV HMM data"):
910920
pos, end, cn = dask.compute(pos, end, cn)
911921

912-
debug("access genes")
913-
df_genome_features = self.genome_features(region=region)
914-
sample_query_options = sample_query_options or {}
915-
df_genes = df_genome_features.query(
916-
f"type == '{self._gff_gene_type}'", **sample_query_options
917-
)
918-
919-
debug("setup intermediates")
922+
# Set up intermediates.
920923
windows = []
921924
modes = []
922925
counts = []
923926

924-
debug("iterate over genes")
927+
# Iterate over genes.
925928
genes_iterator = self._progress(
926929
df_genes.itertuples(),
927930
desc="Compute modal gene copy number",
928931
total=len(df_genes),
929932
)
930933
for gene in genes_iterator:
931-
# locate windows overlapping the gene
934+
# Locate windows overlapping the gene.
932935
loc_gene_start = bisect_left(end, gene.start)
933936
loc_gene_stop = bisect_right(pos, gene.end)
934937
w = loc_gene_stop - loc_gene_start
935938
windows.append(w)
936939

937-
# slice out copy number data for the given gene
940+
# Slice out copy number data for the given gene.
938941
cn_gene = cn[loc_gene_start:loc_gene_stop]
939942

940-
# compute the modes
943+
# Compute the modes.
941944
m, c = _cn_mode(cn_gene, vmax=12)
942945
modes.append(m)
943946
counts.append(c)
944947

945-
debug("combine results")
948+
# Combine results.
946949
windows = np.array(windows)
947950
modes = np.vstack(modes)
948951
counts = np.vstack(counts)
949952

950-
debug("build dataset")
953+
# Build dataset.
951954
ds_out = xr.Dataset(
952955
coords={
953956
"gene_id": (["genes"], df_genes["ID"].values),
@@ -1182,6 +1185,11 @@ def _gene_cnv_frequencies(
11821185

11831186
freq_cols[f"frq_{coh}"] = np.concatenate([amp_freq_coh, del_freq_coh])
11841187

1188+
if len(coh_dict) == 0:
1189+
raise ValueError(
1190+
"No cohorts available for the given sample selection parameters and minimum cohort size."
1191+
)
1192+
11851193
debug("build a dataframe with the frequency columns")
11861194
df_freqs = pd.DataFrame(freq_cols)
11871195

notebooks/plot_frequencies_heatmap.ipynb

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,44 @@
381381
"id": "86c5c594",
382382
"metadata": {},
383383
"outputs": [],
384+
"source": [
385+
"interesting_cyp_genes = [\n",
386+
" \"AGAP002862\", # Cyp6aa1\n",
387+
" \"AGAP013128\", # Cyp6aa2\n",
388+
" \"AGAP002865\", # Cyp6p3\n",
389+
" \"AGAP000818\", # Cyp9k1\n",
390+
" \"AGAP008212\", # Cyp6m2\n",
391+
" \"AGAP008218\", # Cyp6z2 \n",
392+
"]\n",
393+
"\n",
394+
"cyp_cnv_freqs_df = ag3.gene_cnv_frequencies(\n",
395+
" region=interesting_cyp_genes,\n",
396+
" cohorts=\"admin1_year\",\n",
397+
" sample_sets=(\"AG1000G-BF-A\", \"AG1000G-BF-B\", \"AG1000G-BF-C\"),\n",
398+
" sample_query=\"taxon == 'coluzzii'\",\n",
399+
")"
400+
]
401+
},
402+
{
403+
"cell_type": "code",
404+
"execution_count": null,
405+
"id": "6d7ad130-30c2-4cd3-8906-a7ada3ccc75f",
406+
"metadata": {},
407+
"outputs": [],
408+
"source": [
409+
"ag3.plot_frequencies_heatmap(\n",
410+
" df=cyp_cnv_freqs_df,\n",
411+
" color_continuous_scale=\"Blues\",\n",
412+
" title=\"Cyp gene CNV frequencies\",\n",
413+
")"
414+
]
415+
},
416+
{
417+
"cell_type": "code",
418+
"execution_count": null,
419+
"id": "83aab417-632e-4fd2-8da4-3ffdd6e233f6",
420+
"metadata": {},
421+
"outputs": [],
384422
"source": []
385423
}
386424
],

tests/test_af1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from malariagen_data.util import locate_region, resolve_region
77

88

9-
def setup_af1(url="simplecache::gs://vo_afun_release/", **kwargs):
9+
def setup_af1(url="simplecache::gs://vo_afun_release_master_us_central1/", **kwargs):
1010
kwargs.setdefault("check_location", False)
1111
kwargs.setdefault("show_progress", False)
1212
if url is None:

tests/test_ag3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
contigs = "2R", "2L", "3R", "3L", "X"
1616

1717

18-
def setup_ag3(url="simplecache::gs://vo_agam_release/", **kwargs):
18+
def setup_ag3(url="simplecache::gs://vo_agam_release_master_us_central1/", **kwargs):
1919
kwargs.setdefault("check_location", False)
2020
kwargs.setdefault("show_progress", False)
2121
if url is None:

0 commit comments

Comments
 (0)