Skip to content

Commit deb5f9a

Browse files
committed
Solved the MRO issue
2 parents 5abc240 + 5cc504a commit deb5f9a

File tree

11 files changed

+198
-71
lines changed

11 files changed

+198
-71
lines changed

malariagen_data/anoph/fst.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,11 @@ def _fst_gwss(
8282
).compute()
8383

8484
with self._spinner(desc="Compute Fst"):
85-
fst = allel.moving_hudson_fst(ac1, ac2, size=window_size)
86-
# Sometimes Fst can be very slightly below zero, clip for simplicity.
87-
fst = np.clip(fst, a_min=clip_min, a_max=1)
88-
x = allel.moving_statistic(pos, statistic=np.mean, size=window_size)
85+
with np.errstate(divide="ignore", invalid="ignore"):
86+
fst = allel.moving_hudson_fst(ac1, ac2, size=window_size)
87+
# Sometimes Fst can be very slightly below zero, clip for simplicity.
88+
fst = np.clip(fst, a_min=clip_min, a_max=1)
89+
x = allel.moving_statistic(pos, statistic=np.mean, size=window_size)
8990

9091
results = dict(x=x, fst=fst)
9192

malariagen_data/anoph/gplt_params.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Parameters for genome plotting functions. N.B., genome plots are always
22
plotted with bokeh."""
33

4-
from typing import Literal, Mapping, Optional, Union, Sequence
4+
from typing import Literal, Mapping, Optional, Union, Final, Sequence
55

66
import bokeh.models
77
from typing_extensions import Annotated, TypeAlias
@@ -112,4 +112,11 @@
112112
"Passed through to bokeh line() function.",
113113
]
114114

115+
contig_colors: TypeAlias = Annotated[
116+
list[str],
117+
"A sequence of colors.",
118+
]
119+
120+
contig_colors_default: Final[contig_colors] = list(bokeh.palettes.d3["Category20b"][5])
121+
115122
colors: TypeAlias = Annotated[Sequence[str], "List of colors."]

malariagen_data/anoph/h12.py

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,16 @@ def _h12_gwss(
265265
# Compute window midpoints.
266266
pos = ds_haps["variant_position"].values
267267
x = allel.moving_statistic(pos, statistic=np.mean, size=window_size)
268+
contigs = np.asarray(
269+
allel.moving_statistic(
270+
ds_haps["variant_contig"].values,
271+
statistic=np.median,
272+
size=window_size,
273+
),
274+
dtype=int,
275+
)
268276

269-
results = dict(x=x, h12=h12)
277+
results = dict(x=x, h12=h12, contigs=contigs)
270278

271279
return results
272280

@@ -276,6 +284,7 @@ def _h12_gwss(
276284
returns=dict(
277285
x="An array containing the window centre point genomic positions.",
278286
h12="An array with h12 statistic values for each window.",
287+
contigs="An array with the contig for each window. The median is chosen for windows overlapping a change of contig.",
279288
),
280289
)
281290
def h12_gwss(
@@ -296,10 +305,10 @@ def h12_gwss(
296305
random_seed: base_params.random_seed = 42,
297306
chunks: base_params.chunks = base_params.native_chunks,
298307
inline_array: base_params.inline_array = base_params.inline_array_default,
299-
) -> Tuple[np.ndarray, np.ndarray]:
308+
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
300309
# Change this name if you ever change the behaviour of this function, to
301310
# invalidate any previously cached data.
302-
name = "h12_gwss_v1"
311+
name = "h12_gwss_v2"
303312

304313
params = dict(
305314
contig=contig,
@@ -326,8 +335,9 @@ def h12_gwss(
326335

327336
x = results["x"]
328337
h12 = results["h12"]
338+
contigs = results["contigs"]
329339

330-
return x, h12
340+
return x, h12, contigs
331341

332342
@check_types
333343
@doc(
@@ -353,14 +363,15 @@ def plot_h12_gwss_track(
353363
sizing_mode: gplt_params.sizing_mode = gplt_params.sizing_mode_default,
354364
width: gplt_params.width = gplt_params.width_default,
355365
height: gplt_params.height = 200,
366+
contig_colors: gplt_params.contig_colors = gplt_params.contig_colors_default,
356367
show: gplt_params.show = True,
357368
x_range: Optional[gplt_params.x_range] = None,
358369
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
359370
chunks: base_params.chunks = base_params.native_chunks,
360371
inline_array: base_params.inline_array = base_params.inline_array_default,
361372
) -> gplt_params.figure:
362373
# Compute H12.
363-
x, h12 = self.h12_gwss(
374+
x, h12, contigs = self.h12_gwss(
364375
contig=contig,
365376
analysis=analysis,
366377
window_size=window_size,
@@ -411,15 +422,14 @@ def plot_h12_gwss_track(
411422
)
412423

413424
# Plot H12.
414-
fig.scatter(
415-
x=x,
416-
y=h12,
417-
marker="circle",
418-
size=3,
419-
line_width=1,
420-
line_color="black",
421-
fill_color=None,
422-
)
425+
for s in set(contigs):
426+
idxs = contigs == s
427+
fig.scatter(
428+
x=x[idxs],
429+
y=h12[idxs],
430+
marker="circle",
431+
color=contig_colors[s % len(contig_colors)],
432+
)
423433

424434
# Tidy up the plot.
425435
fig.yaxis.axis_label = "H12"
@@ -456,6 +466,7 @@ def plot_h12_gwss(
456466
sizing_mode: gplt_params.sizing_mode = gplt_params.sizing_mode_default,
457467
width: gplt_params.width = gplt_params.width_default,
458468
track_height: gplt_params.track_height = 170,
469+
contig_colors: gplt_params.contig_colors = gplt_params.contig_colors_default,
459470
genes_height: gplt_params.genes_height = gplt_params.genes_height_default,
460471
show: gplt_params.show = True,
461472
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
@@ -478,6 +489,7 @@ def plot_h12_gwss(
478489
sizing_mode=sizing_mode,
479490
width=width,
480491
height=track_height,
492+
contig_colors=contig_colors,
481493
show=False,
482494
output_backend=output_backend,
483495
chunks=chunks,
@@ -574,7 +586,7 @@ def plot_h12_gwss_multi_overlay_track(
574586
)
575587

576588
# Determine X axis range.
577-
x, _ = res[list(cohort_queries.keys())[0]]
589+
x, _, _ = res[list(cohort_queries.keys())[0]]
578590
x_min = x[0]
579591
x_max = x[-1]
580592
if x_range is None:
@@ -609,7 +621,7 @@ def plot_h12_gwss_multi_overlay_track(
609621
)
610622

611623
# Plot H12.
612-
for i, (cohort_label, (x, h12)) in enumerate(res.items()):
624+
for i, (cohort_label, (x, h12, contig)) in enumerate(res.items()):
613625
fig.scatter(
614626
x=x,
615627
y=h12,

malariagen_data/anoph/h1x.py

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,16 @@ def _h1x_gwss(
8383
# Compute window midpoints.
8484
pos = ds1["variant_position"].values
8585
x = allel.moving_statistic(pos, statistic=np.mean, size=window_size)
86+
contigs = np.asarray(
87+
allel.moving_statistic(
88+
ds1["variant_contig"].values,
89+
statistic=np.median,
90+
size=window_size,
91+
),
92+
dtype=int,
93+
)
8694

87-
results = dict(x=x, h1x=h1x)
95+
results = dict(x=x, h1x=h1x, contigs=contigs)
8896

8997
return results
9098

@@ -97,6 +105,7 @@ def _h1x_gwss(
97105
returns=dict(
98106
x="An array containing the window centre point genomic positions.",
99107
h1x="An array with H1X statistic values for each window.",
108+
contigs="An array with the contig for each window. The median is chosen for windows overlapping a change of contig.",
100109
),
101110
)
102111
def h1x_gwss(
@@ -118,10 +127,10 @@ def h1x_gwss(
118127
random_seed: base_params.random_seed = 42,
119128
chunks: base_params.chunks = base_params.native_chunks,
120129
inline_array: base_params.inline_array = base_params.inline_array_default,
121-
) -> Tuple[np.ndarray, np.ndarray]:
130+
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
122131
# Change this name if you ever change the behaviour of this function, to
123132
# invalidate any previously cached data.
124-
name = "h1x_gwss_v1"
133+
name = "h1x_gwss_v2"
125134

126135
params = dict(
127136
contig=contig,
@@ -149,8 +158,9 @@ def h1x_gwss(
149158

150159
x = results["x"]
151160
h1x = results["h1x"]
161+
contigs = results["contigs"]
152162

153-
return x, h1x
163+
return x, h1x, contigs
154164

155165
@check_types
156166
@doc(
@@ -180,14 +190,15 @@ def plot_h1x_gwss_track(
180190
sizing_mode: gplt_params.sizing_mode = gplt_params.sizing_mode_default,
181191
width: gplt_params.width = gplt_params.width_default,
182192
height: gplt_params.height = 200,
193+
contig_colors: gplt_params.contig_colors = gplt_params.contig_colors_default,
183194
show: gplt_params.show = True,
184195
x_range: Optional[gplt_params.x_range] = None,
185196
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
186197
chunks: base_params.chunks = base_params.native_chunks,
187198
inline_array: base_params.inline_array = base_params.inline_array_default,
188199
) -> gplt_params.figure:
189200
# Compute H1X.
190-
x, h1x = self.h1x_gwss(
201+
x, h1x, contigs = self.h1x_gwss(
191202
contig=contig,
192203
analysis=analysis,
193204
window_size=window_size,
@@ -239,15 +250,14 @@ def plot_h1x_gwss_track(
239250
)
240251

241252
# Plot H1X.
242-
fig.scatter(
243-
x=x,
244-
y=h1x,
245-
marker="circle",
246-
size=3,
247-
line_width=1,
248-
line_color="black",
249-
fill_color=None,
250-
)
253+
for s in set(contigs):
254+
idxs = contigs == s
255+
fig.scatter(
256+
x=x[idxs],
257+
y=h1x[idxs],
258+
marker="circle",
259+
color=contig_colors[s % len(contig_colors)],
260+
)
251261

252262
# Tidy up the plot.
253263
fig.yaxis.axis_label = "H1X"
@@ -288,6 +298,7 @@ def plot_h1x_gwss(
288298
sizing_mode: gplt_params.sizing_mode = gplt_params.sizing_mode_default,
289299
width: gplt_params.width = gplt_params.width_default,
290300
track_height: gplt_params.track_height = 190,
301+
contig_colors: gplt_params.contig_colors = gplt_params.contig_colors_default,
291302
genes_height: gplt_params.genes_height = gplt_params.genes_height_default,
292303
show: gplt_params.show = True,
293304
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
@@ -311,6 +322,7 @@ def plot_h1x_gwss(
311322
sizing_mode=sizing_mode,
312323
width=width,
313324
height=track_height,
325+
contig_colors=contig_colors,
314326
show=False,
315327
output_backend=output_backend,
316328
chunks=chunks,

malariagen_data/anopheles.py

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -888,14 +888,24 @@ def _gene_cnv(
888888
chunks,
889889
inline_array,
890890
):
891-
debug = self._log.debug
892-
893-
debug("sanity check")
891+
# Sanity check.
894892
assert isinstance(region, Region)
895893

896-
debug("access HMM data")
894+
# Access genes within the region of interest.
895+
df_genome_features = self.genome_features(region=region)
896+
sample_query_options = sample_query_options or {}
897+
df_genes = df_genome_features.query(
898+
f"type == '{self._gff_gene_type}'", **sample_query_options
899+
)
900+
901+
# Refine the region for CNV data to ensure coverage of all requested genes.
902+
cnv_region = Region(
903+
region.contig, df_genes["start"].min(), df_genes["end"].max()
904+
)
905+
906+
# Access HMM data.
897907
ds_hmm = self.cnv_hmm(
898-
region=region.contig,
908+
region=cnv_region,
899909
sample_sets=sample_sets,
900910
sample_query=sample_query,
901911
sample_query_options=sample_query_options,
@@ -909,45 +919,38 @@ def _gene_cnv(
909919
with self._dask_progress(desc="Load CNV HMM data"):
910920
pos, end, cn = dask.compute(pos, end, cn)
911921

912-
debug("access genes")
913-
df_genome_features = self.genome_features(region=region)
914-
sample_query_options = sample_query_options or {}
915-
df_genes = df_genome_features.query(
916-
f"type == '{self._gff_gene_type}'", **sample_query_options
917-
)
918-
919-
debug("setup intermediates")
922+
# Set up intermediates.
920923
windows = []
921924
modes = []
922925
counts = []
923926

924-
debug("iterate over genes")
927+
# Iterate over genes.
925928
genes_iterator = self._progress(
926929
df_genes.itertuples(),
927930
desc="Compute modal gene copy number",
928931
total=len(df_genes),
929932
)
930933
for gene in genes_iterator:
931-
# locate windows overlapping the gene
934+
# Locate windows overlapping the gene.
932935
loc_gene_start = bisect_left(end, gene.start)
933936
loc_gene_stop = bisect_right(pos, gene.end)
934937
w = loc_gene_stop - loc_gene_start
935938
windows.append(w)
936939

937-
# slice out copy number data for the given gene
940+
# Slice out copy number data for the given gene.
938941
cn_gene = cn[loc_gene_start:loc_gene_stop]
939942

940-
# compute the modes
943+
# Compute the modes.
941944
m, c = _cn_mode(cn_gene, vmax=12)
942945
modes.append(m)
943946
counts.append(c)
944947

945-
debug("combine results")
948+
# Combine results.
946949
windows = np.array(windows)
947950
modes = np.vstack(modes)
948951
counts = np.vstack(counts)
949952

950-
debug("build dataset")
953+
# Build dataset.
951954
ds_out = xr.Dataset(
952955
coords={
953956
"gene_id": (["genes"], df_genes["ID"].values),

notebooks/plot_frequencies_heatmap.ipynb

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,44 @@
381381
"id": "86c5c594",
382382
"metadata": {},
383383
"outputs": [],
384+
"source": [
385+
"interesting_cyp_genes = [\n",
386+
" \"AGAP002862\", # Cyp6aa1\n",
387+
" \"AGAP013128\", # Cyp6aa2\n",
388+
" \"AGAP002865\", # Cyp6p3\n",
389+
" \"AGAP000818\", # Cyp9k1\n",
390+
" \"AGAP008212\", # Cyp6m2\n",
391+
" \"AGAP008218\", # Cyp6z2 \n",
392+
"]\n",
393+
"\n",
394+
"cyp_cnv_freqs_df = ag3.gene_cnv_frequencies(\n",
395+
" region=interesting_cyp_genes,\n",
396+
" cohorts=\"admin1_year\",\n",
397+
" sample_sets=(\"AG1000G-BF-A\", \"AG1000G-BF-B\", \"AG1000G-BF-C\"),\n",
398+
" sample_query=\"taxon == 'coluzzii'\",\n",
399+
")"
400+
]
401+
},
402+
{
403+
"cell_type": "code",
404+
"execution_count": null,
405+
"id": "6d7ad130-30c2-4cd3-8906-a7ada3ccc75f",
406+
"metadata": {},
407+
"outputs": [],
408+
"source": [
409+
"ag3.plot_frequencies_heatmap(\n",
410+
" df=cyp_cnv_freqs_df,\n",
411+
" color_continuous_scale=\"Blues\",\n",
412+
" title=\"Cyp gene CNV frequencies\",\n",
413+
")"
414+
]
415+
},
416+
{
417+
"cell_type": "code",
418+
"execution_count": null,
419+
"id": "83aab417-632e-4fd2-8da4-3ffdd6e233f6",
420+
"metadata": {},
421+
"outputs": [],
384422
"source": []
385423
}
386424
],

0 commit comments

Comments
 (0)