From 837b1de04157ffe023a75c7efb1b9abbfe7839ed Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Fri, 5 Sep 2025 22:05:03 +0100 Subject: [PATCH 01/15] add advanced haplotype clustering --- malariagen_data/anoph/dipclust.py | 6 +- malariagen_data/anoph/hapclust.py | 374 +++++++++++++++++++++++++++++- 2 files changed, 372 insertions(+), 8 deletions(-) diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py index b96ee4283..043019ec4 100644 --- a/malariagen_data/anoph/dipclust.py +++ b/malariagen_data/anoph/dipclust.py @@ -22,13 +22,9 @@ dipclust_params, cnv_params, ) -from .snp_frq import AnophelesSnpFrequencyAnalysis +from .snp_frq import AnophelesSnpFrequencyAnalysis, AA_CHANGE_QUERY from .cnv_frq import AnophelesCnvFrequencyAnalysis -AA_CHANGE_QUERY = ( - "effect in ['NON_SYNONYMOUS_CODING', 'START_LOST', 'STOP_LOST', 'STOP_GAINED']" -) - class AnophelesDipClustAnalysis( AnophelesCnvFrequencyAnalysis, AnophelesSnpFrequencyAnalysis diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 05eb6acb8..bb9d80369 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -16,6 +16,7 @@ hapclust_params, ) from .snp_data import AnophelesSnpData +from .snp_frq import AA_CHANGE_QUERY from .hap_data import AnophelesHapData @@ -127,6 +128,7 @@ def plot_haplotype_clustering( # Repeat the dataframe so there is one row of metadata for each haplotype. df_haps = pd.DataFrame(np.repeat(df_samples_phased.values, 2, axis=0)) df_haps.columns = df_samples_phased.columns + leaf_data = df_haps.assign(sample_id=_make_unique(df_haps.sample_id)) # Configure hover data. hover_data = self._setup_sample_hover_data_plotly( @@ -145,7 +147,7 @@ def plot_haplotype_clustering( # Create the plot. 
with self._spinner("Plot dendrogram"): - fig, _ = plot_dendrogram( + fig, leaf_data = plot_dendrogram( dist=dist, linkage_method=linkage_method, count_sort=count_sort, @@ -157,7 +159,7 @@ def plot_haplotype_clustering( line_width=line_width, line_color=line_color, marker_size=marker_size, - leaf_data=df_haps, + leaf_data=leaf_data, leaf_hover_name="sample_id", leaf_hover_data=hover_data, leaf_color=color_prepped, @@ -182,7 +184,13 @@ def plot_haplotype_clustering( fig.show(renderer=renderer) return None else: - return fig + return { + "figure": fig, + "n_snps": n_snps_used, + "dist": dist, + "dist_samples": phased_samples, + "leaf_data":leaf_data, + } @doc( summary=""" @@ -292,3 +300,363 @@ def _haplotype_pairwise_distances( phased_samples=phased_samples, n_snps=np.array(ht.shape[0]), ) + + def plot_haplotype_clustering_advanced( + self, + region, + analysis, + snp_transcript=None, + snp_colorscale="Greys", + snp_filter_min_maf=0.05, + snp_query=AA_CHANGE_QUERY, + sample_sets=None, + sample_query=None, + sample_query_options=None, + random_seed=42, + cohort_size=None, + cut_height=None, + min_cluster_size=None, + color=None, + symbol=None, + linkage_method="complete", + count_sort=None, + distance_sort=None, + title=True, + title_font_size=14, + width=None, + dendrogram_height=300, + snp_row_height=25, + show=True, + renderer=None, + render_mode="svg", + leaf_y=0, + marker_size=5, + line_width=0.5, + line_color="black", + color_discrete_sequence=None, + color_discrete_map=None, + category_orders=None, + legend_sizing="constant", + chunks="native", + inline_array=True, + ): + import plotly.express as px + import plotly.graph_objects as go + + if cohort_size and snp_transcript: + cohort_size = None + print( + "Cohort size is not supported with amino acid heatmap. Overriding cohort size to None." 
+ ) + + res = self.plot_haplotype_clustering( + region=region, + analysis=analysis, + sample_sets=sample_sets, + sample_query=sample_query, + sample_query_options=sample_query_options, + count_sort=count_sort, + cohort_size=cohort_size, + distance_sort=distance_sort, + linkage_method=linkage_method, + color=color, + symbol=symbol, + title=title, + title_font_size=title_font_size, + width=width, + height=dendrogram_height, + show=False, + renderer=renderer, + render_mode=render_mode, + leaf_y=leaf_y, + marker_size=marker_size, + line_width=line_width, + line_color=line_color, + color_discrete_sequence=color_discrete_sequence, + color_discrete_map=color_discrete_map, + category_orders=category_orders, + legend_sizing=legend_sizing, + random_seed=random_seed, + chunks=chunks, + inline_array=inline_array, + ) + + fig_dendro = res["figure"] + n_snps_cluster = res["n_snps"] + leaf_data = res["leaf_data"] + dendro_sample_id_order = np.asarray(leaf_data["sample_id"].to_list()) + + figures = [fig_dendro] + subplot_heights = [dendrogram_height] + + if cut_height and min_cluster_size: + cluster_col_list = px.colors.sequential.Turbo.copy() + cluster_col_list.insert(0, 'white') + + df_clusters = self.cut_dist_tree( + dist=res['dist'], + dist_samples=_make_unique(np.repeat(res['dist_samples'], 2)), + dendro_sample_id_order=dendro_sample_id_order, + linkage_method=linkage_method, + cut_height=cut_height, + min_cluster_size=min_cluster_size, + count_sort=count_sort, + distance_sort=distance_sort + ) + + leaf_data = leaf_data.merge(df_clusters.T.reset_index()) + + snp_trace = go.Heatmap( + z=df_clusters.values, + y=df_clusters.index.to_list(), + colorscale=cluster_col_list, + showlegend=False, + showscale=False, + ) + figures.append(snp_trace) + subplot_heights.append(25) + + + figures, subplot_heights, df_haps = self._insert_hapclust_snp_trace( + transcript=snp_transcript, + snp_query=snp_query, + figures=figures, + subplot_heights=subplot_heights, + sample_sets=sample_sets, + 
sample_query=sample_query, + analysis=analysis, + dendro_sample_id_order=dendro_sample_id_order, + snp_filter_min_maf=snp_filter_min_maf, + snp_colorscale=snp_colorscale, + snp_row_height=snp_row_height, + chunks=chunks, + inline_array=inline_array, + ) + n_aa = df_haps.shape[0] + + # Calculate total height based on subplot heights, plus a fixed + # additional component to allow for title, axes etc. + height = sum(subplot_heights) + 50 + fig = self._dipclust_concat_subplots( + figures=figures, + width=width, + height=height, + row_heights=subplot_heights, + sample_sets=sample_sets, + sample_query=sample_query, # Only uses query for title. + region=region, + n_snps=n_snps_cluster, + ) + + fig["layout"]["yaxis"]["title"] = f"Distance" + fig.update_layout( + title_font=dict( + size=title_font_size, + ), + legend=dict(itemsizing=legend_sizing, tracegroupgap=0), + ) + + if snp_transcript: + # add lines to aa plot + aa_idx = len(figures) + fig.add_hline(y=-0.5, line_width=1, line_color="grey", row=aa_idx, col=1) + for i in range(n_aa): + fig.add_hline(y=i+0.5, line_width=1, line_color="grey", row=aa_idx, col=1) + + fig.update_xaxes(showline = True, linecolor = 'grey', linewidth = 1, row = aa_idx, col = 1, mirror = True) + fig.update_yaxes(showline = True, linecolor = 'grey', linewidth = 1, row = aa_idx, col = 1, mirror = True) + + if show: + fig.show(renderer=renderer) + return None + else: + return fig, leaf_data, df_haps + + + def transcript_haplotypes( + self, + transcript, + sample_sets, + sample_query, + analysis, + snp_query, + chunks, + inline_array + ): + """ + Extract haplotype calls for a given transcript. 
+ """ + + # Get SNP genotype allele counts for the transcript, applying snp_query + df_snps = self.snp_genotype_allele_counts( + transcript=transcript, + sample_query=sample_query, + sample_sets=sample_sets, + site_mask=None, + snp_query=snp_query, + chunks=chunks, + inline_array=inline_array + ) + + # Add a unique variant identifier: "position-alt_allele" + df_snps = df_snps.assign( + pos_alt=lambda x: x.position.astype(str) + "-" + x.alt_allele + ) + + # Get haplotypes for the transcript + ds_haps = self.haplotypes( + region=transcript, + sample_sets=sample_sets, + sample_query=sample_query, + analysis=analysis, + chunks=chunks, + inline_array=inline_array + ) + + # Convert genotype calls to haplotypes + haps = allel.GenotypeArray(ds_haps["call_genotype"].values).to_haplotypes() + h_pos = allel.SortedIndex(ds_haps["variant_position"].values) + h_alts = ds_haps["variant_allele"].values.astype(str)[:, 1] + h_pos_alts = np.array([f"{pos}-{h_alts[i]}" for i, pos in enumerate(h_pos)]) + sample_ids = ds_haps["sample_id"].values + + # Filter haplotypes to SNPs present in df_snps + df_snps = df_snps.query("pos_alt in @h_pos_alts") + label = df_snps.label.values + haps_bool = np.isin(h_pos_alts, df_snps.pos_alt) + haps = haps.compress(haps_bool) + + # Build haplotype DataFrame + df_haps = pd.DataFrame( + haps, + index=label, + columns=_make_unique(np.repeat(sample_ids, 2)) # two haplotypes per sample + ) + + return df_haps + + def _insert_hapclust_snp_trace( + self, + figures, + subplot_heights, + transcript, + analysis, + dendro_sample_id_order: np.ndarray, + snp_filter_min_maf: float, + snp_colorscale, + snp_query, + snp_row_height, + sample_sets, + sample_query, + chunks, + inline_array, + ): + from plotly import graph_objects as go + + # load genotype allele counts at SNP variants for each sample + df_haps = self.transcript_haplotypes( + transcript=transcript, + snp_query=snp_query, + sample_query=sample_query, + sample_sets=sample_sets, + analysis=analysis, + 
chunks=chunks, + inline_array=inline_array + ) + + # set to diplotype cluster order + df_haps = df_haps.loc[ + :, dendro_sample_id_order + ] + + if snp_filter_min_maf: + df_haps = df_haps.assign(af=lambda x: x.sum(axis=1) / x.shape[1]) + df_haps = df_haps.query("af > @snp_filter_min_maf").drop(columns="af") + + if not df_haps.empty: + snp_trace = go.Heatmap( + z=df_haps.values, + y=df_haps.index.to_list(), + colorscale=snp_colorscale, + showlegend=False, + showscale=False, + ) + else: + snp_trace = None + + if snp_trace: + figures.append(snp_trace) + subplot_heights.append(snp_row_height * df_haps.shape[0]) + else: + print( + f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot." + ) + return figures, subplot_heights, df_haps + + + def cut_dist_tree(self, dist, dist_samples, dendro_sample_id_order, linkage_method, cut_height, min_cluster_size, count_sort, distance_sort): + """ + Create a one-row DataFrame with haplotype_ids as columns and cluster assignments as values + + Parameters: + ----------- + dist : ndarray + distance array + dist_samples : array-like + List/array of individual identifiers (haplotype_ids) + linkage_method : str + Method used to calculate the linkage matrix + cut_height : float + Height at which to cut the dendrogram + min_cluster_size : int, default=1 + Minimum number of individuals required in a cluster to be included + count_sort : bool + Whether to sort clusters by count + distance_sort : bool + Whether to sort clusters by distance + + Returns: + -------- + pd.DataFrame + One-row DataFrame with haplotype_ids as columns and assigned cluster numbers (1...n) as values + """ + from scipy.cluster.hierarchy import linkage, fcluster + + Z = linkage(dist, method=linkage_method) + + # Get cluster assignments for each individual + cluster_assignments = fcluster( + Z, cut_height, criterion='distance' + ) + + # Create initial DataFrame + df = pd.DataFrame({ + 'sample_id': dist_samples, + 'cluster_id': 
_filter_and_remap(cluster_assignments, x=min_cluster_size) + }).set_index('sample_id').T.loc[:, dendro_sample_id_order] + + return df + +def _filter_and_remap(arr, x): + from collections import Counter + # Get unique values that appear >= x times + valid_values = [val for val, count in Counter(arr).items() if count >= x] + # Create mapping to 1, 2, 3, ..., n + mapping = {val: i + 1 for i, val in enumerate(sorted(valid_values))} + # Apply transformation + return np.array([mapping.get(val, 0) for val in arr]) + + +def _make_unique(values): + value_counts = {} + unique_values = [] + + for value in values: + if value in value_counts: + value_counts[value] += 1 + unique_values.append(f"{value}_{value_counts[value]}") + else: + value_counts[value] = 0 + unique_values.append(f"{value}_{value_counts[value]}") + + return np.array(unique_values) \ No newline at end of file From df2bb6c6bce0d6426a764acb60cb25ec650f65ec Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Fri, 5 Sep 2025 22:32:05 +0100 Subject: [PATCH 02/15] use snp_effects() --- malariagen_data/anoph/hapclust.py | 139 +++++++++++++++++------------- 1 file changed, 81 insertions(+), 58 deletions(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index bb9d80369..cf1134e1c 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -189,7 +189,7 @@ def plot_haplotype_clustering( "n_snps": n_snps_used, "dist": dist, "dist_samples": phased_samples, - "leaf_data":leaf_data, + "leaf_data": leaf_data, } @doc( @@ -388,24 +388,24 @@ def plot_haplotype_clustering_advanced( figures = [fig_dendro] subplot_heights = [dendrogram_height] - + if cut_height and min_cluster_size: cluster_col_list = px.colors.sequential.Turbo.copy() - cluster_col_list.insert(0, 'white') - + cluster_col_list.insert(0, "white") + df_clusters = self.cut_dist_tree( - dist=res['dist'], - dist_samples=_make_unique(np.repeat(res['dist_samples'], 2)), + dist=res["dist"], + 
dist_samples=_make_unique(np.repeat(res["dist_samples"], 2)), dendro_sample_id_order=dendro_sample_id_order, - linkage_method=linkage_method, + linkage_method=linkage_method, cut_height=cut_height, - min_cluster_size=min_cluster_size, - count_sort=count_sort, - distance_sort=distance_sort + min_cluster_size=min_cluster_size, + count_sort=count_sort, + distance_sort=distance_sort, ) leaf_data = leaf_data.merge(df_clusters.T.reset_index()) - + snp_trace = go.Heatmap( z=df_clusters.values, y=df_clusters.index.to_list(), @@ -415,8 +415,7 @@ def plot_haplotype_clustering_advanced( ) figures.append(snp_trace) subplot_heights.append(25) - - + figures, subplot_heights, df_haps = self._insert_hapclust_snp_trace( transcript=snp_transcript, snp_query=snp_query, @@ -448,31 +447,46 @@ def plot_haplotype_clustering_advanced( n_snps=n_snps_cluster, ) - fig["layout"]["yaxis"]["title"] = f"Distance" + fig["layout"]["yaxis"]["title"] = "Distance" fig.update_layout( title_font=dict( size=title_font_size, ), legend=dict(itemsizing=legend_sizing, tracegroupgap=0), ) - + if snp_transcript: - # add lines to aa plot + # add lines to aa plot aa_idx = len(figures) fig.add_hline(y=-0.5, line_width=1, line_color="grey", row=aa_idx, col=1) for i in range(n_aa): - fig.add_hline(y=i+0.5, line_width=1, line_color="grey", row=aa_idx, col=1) - - fig.update_xaxes(showline = True, linecolor = 'grey', linewidth = 1, row = aa_idx, col = 1, mirror = True) - fig.update_yaxes(showline = True, linecolor = 'grey', linewidth = 1, row = aa_idx, col = 1, mirror = True) + fig.add_hline( + y=i + 0.5, line_width=1, line_color="grey", row=aa_idx, col=1 + ) + + fig.update_xaxes( + showline=True, + linecolor="grey", + linewidth=1, + row=aa_idx, + col=1, + mirror=True, + ) + fig.update_yaxes( + showline=True, + linecolor="grey", + linewidth=1, + row=aa_idx, + col=1, + mirror=True, + ) if show: fig.show(renderer=renderer) return None else: return fig, leaf_data, df_haps - - + def transcript_haplotypes( self, 
transcript, @@ -481,25 +495,19 @@ def transcript_haplotypes( analysis, snp_query, chunks, - inline_array + inline_array, ): """ Extract haplotype calls for a given transcript. """ - + # Get SNP genotype allele counts for the transcript, applying snp_query - df_snps = self.snp_genotype_allele_counts( + df_eff = self.snp_effects( transcript=transcript, - sample_query=sample_query, - sample_sets=sample_sets, - site_mask=None, - snp_query=snp_query, - chunks=chunks, - inline_array=inline_array ) # Add a unique variant identifier: "position-alt_allele" - df_snps = df_snps.assign( + df_eff = df_eff.assign( pos_alt=lambda x: x.position.astype(str) + "-" + x.alt_allele ) @@ -510,7 +518,7 @@ def transcript_haplotypes( sample_query=sample_query, analysis=analysis, chunks=chunks, - inline_array=inline_array + inline_array=inline_array, ) # Convert genotype calls to haplotypes @@ -520,17 +528,17 @@ def transcript_haplotypes( h_pos_alts = np.array([f"{pos}-{h_alts[i]}" for i, pos in enumerate(h_pos)]) sample_ids = ds_haps["sample_id"].values - # Filter haplotypes to SNPs present in df_snps - df_snps = df_snps.query("pos_alt in @h_pos_alts") - label = df_snps.label.values - haps_bool = np.isin(h_pos_alts, df_snps.pos_alt) + # Filter df_eff to haplotypes, and filter haplotypes to SNPs present in df_eff + df_eff = df_eff.query("pos_alt in @h_pos_alts") + label = df_eff.label.values + haps_bool = np.isin(h_pos_alts, df_eff.pos_alt) haps = haps.compress(haps_bool) # Build haplotype DataFrame df_haps = pd.DataFrame( haps, index=label, - columns=_make_unique(np.repeat(sample_ids, 2)) # two haplotypes per sample + columns=_make_unique(np.repeat(sample_ids, 2)), # two haplotypes per sample ) return df_haps @@ -552,7 +560,7 @@ def _insert_hapclust_snp_trace( inline_array, ): from plotly import graph_objects as go - + # load genotype allele counts at SNP variants for each sample df_haps = self.transcript_haplotypes( transcript=transcript, @@ -561,13 +569,11 @@ def 
_insert_hapclust_snp_trace( sample_sets=sample_sets, analysis=analysis, chunks=chunks, - inline_array=inline_array + inline_array=inline_array, ) - + # set to diplotype cluster order - df_haps = df_haps.loc[ - :, dendro_sample_id_order - ] + df_haps = df_haps.loc[:, dendro_sample_id_order] if snp_filter_min_maf: df_haps = df_haps.assign(af=lambda x: x.sum(axis=1) / x.shape[1]) @@ -583,7 +589,7 @@ def _insert_hapclust_snp_trace( ) else: snp_trace = None - + if snp_trace: figures.append(snp_trace) subplot_heights.append(snp_row_height * df_haps.shape[0]) @@ -593,8 +599,17 @@ def _insert_hapclust_snp_trace( ) return figures, subplot_heights, df_haps - - def cut_dist_tree(self, dist, dist_samples, dendro_sample_id_order, linkage_method, cut_height, min_cluster_size, count_sort, distance_sort): + def cut_dist_tree( + self, + dist, + dist_samples, + dendro_sample_id_order, + linkage_method, + cut_height, + min_cluster_size, + count_sort, + distance_sort, + ): """ Create a one-row DataFrame with haplotype_ids as columns and cluster assignments as values @@ -625,20 +640,28 @@ def cut_dist_tree(self, dist, dist_samples, dendro_sample_id_order, linkage_meth Z = linkage(dist, method=linkage_method) # Get cluster assignments for each individual - cluster_assignments = fcluster( - Z, cut_height, criterion='distance' - ) - + cluster_assignments = fcluster(Z, cut_height, criterion="distance") + # Create initial DataFrame - df = pd.DataFrame({ - 'sample_id': dist_samples, - 'cluster_id': _filter_and_remap(cluster_assignments, x=min_cluster_size) - }).set_index('sample_id').T.loc[:, dendro_sample_id_order] + df = ( + pd.DataFrame( + { + "sample_id": dist_samples, + "cluster_id": _filter_and_remap( + cluster_assignments, x=min_cluster_size + ), + } + ) + .set_index("sample_id") + .T.loc[:, dendro_sample_id_order] + ) return df + def _filter_and_remap(arr, x): from collections import Counter + # Get unique values that appear >= x times valid_values = [val for val, count in 
Counter(arr).items() if count >= x] # Create mapping to 1, 2, 3, ..., n @@ -650,7 +673,7 @@ def _filter_and_remap(arr, x): def _make_unique(values): value_counts = {} unique_values = [] - + for value in values: if value in value_counts: value_counts[value] += 1 @@ -658,5 +681,5 @@ def _make_unique(values): else: value_counts[value] = 0 unique_values.append(f"{value}_{value_counts[value]}") - - return np.array(unique_values) \ No newline at end of file + + return np.array(unique_values) From eadb8b12c5e1e3f0e255000332edeadc6b68dbd3 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Fri, 5 Sep 2025 22:40:46 +0100 Subject: [PATCH 03/15] add labels --- malariagen_data/anoph/hapclust.py | 10 +- notebooks/plot_haplotype_clustering.ipynb | 194 ++++++++++++++++++++-- 2 files changed, 192 insertions(+), 12 deletions(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index cf1134e1c..10194b29a 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -5,7 +5,7 @@ import pandas as pd from numpydoc_decorator import doc # type: ignore -from ..util import CacheMiss, check_types, pdist_abs_hamming +from ..util import CacheMiss, check_types, pdist_abs_hamming, pandas_apply from ..plotly_dendrogram import plot_dendrogram from . 
import ( base_params, @@ -16,7 +16,7 @@ hapclust_params, ) from .snp_data import AnophelesSnpData -from .snp_frq import AA_CHANGE_QUERY +from .snp_frq import AA_CHANGE_QUERY, _make_snp_label_effect from .hap_data import AnophelesHapData @@ -504,6 +504,12 @@ def transcript_haplotypes( # Get SNP genotype allele counts for the transcript, applying snp_query df_eff = self.snp_effects( transcript=transcript, + ).query(snp_query) + + df_eff["label"] = pandas_apply( + _make_snp_label_effect, + df_eff, + columns=["contig", "position", "ref_allele", "alt_allele", "aa_change"], ) # Add a unique variant identifier: "position-alt_allele" diff --git a/notebooks/plot_haplotype_clustering.ipynb b/notebooks/plot_haplotype_clustering.ipynb index c6dbdf4f3..fd344a91e 100644 --- a/notebooks/plot_haplotype_clustering.ipynb +++ b/notebooks/plot_haplotype_clustering.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "d3f73666", "metadata": {}, "outputs": [], @@ -21,10 +21,109 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "38abbb28", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/javascript": "'use strict';\n(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\nconst JS_MIME_TYPE = 'application/javascript';\n const HTML_MIME_TYPE = 'text/html';\n const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n const CLASS_NAME = 'output_bokeh rendered_html';\n\n /**\n * Render data to the DOM node\n */\n function render(props, node) {\n const script = document.createElement(\"script\");\n node.appendChild(script);\n }\n\n /**\n * Handle when an output is cleared or removed\n */\n function handleClearOutput(event, handle) {\n function drop(id) {\n const view = 
Bokeh.index.get_by_id(id)\n if (view != null) {\n view.model.document.clear()\n Bokeh.index.delete(view)\n }\n }\n\n const cell = handle.cell;\n\n const id = cell.output_area._bokeh_element_id;\n const server_id = cell.output_area._bokeh_server_id;\n\n // Clean up Bokeh references\n if (id != null) {\n drop(id)\n }\n\n if (server_id !== undefined) {\n // Clean up Bokeh references\n const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n cell.notebook.kernel.execute(cmd_clean, {\n iopub: {\n output: function(msg) {\n const id = msg.content.text.trim()\n drop(id)\n }\n }\n });\n // Destroy server and session\n const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n cell.notebook.kernel.execute(cmd_destroy);\n }\n }\n\n /**\n * Handle when a new output is added\n */\n function handleAddOutput(event, handle) {\n const output_area = handle.output_area;\n const output = handle.output;\n\n // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n return\n }\n\n const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n\n if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n // store reference to embed id on output_area\n output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n }\n if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n const bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n const script_attrs = bk_div.children[0].attributes;\n for (let i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n 
toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n }\n\n function register_renderer(events, OutputArea) {\n\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n const toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n render(props, toinsert[toinsert.length - 1]);\n element.append(toinsert);\n return toinsert\n }\n\n /* Handle when an output is cleared or removed */\n events.on('clear_output.CodeCell', handleClearOutput);\n events.on('delete.Cell', handleClearOutput);\n\n /* Handle when a new output is added */\n events.on('output_added.OutputArea', handleAddOutput);\n\n /**\n * Register the mime type and append_mime function with output_area\n */\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n /* Is output safe? */\n safe: true,\n /* Index of renderer in `output_area.display_order` */\n index: 0\n });\n }\n\n // register the mime type if in Jupyter Notebook environment and previously unregistered\n if (root.Jupyter !== undefined) {\n const events = require('base/js/events');\n const OutputArea = require('notebook/js/outputarea').OutputArea;\n\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n }\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded(error = null) {\n const el = document.getElementById(null);\n if (el != null) {\n const html = (() => {\n if (typeof root.Bokeh === \"undefined\") {\n if (error == null) {\n return \"BokehJS is loading ...\";\n } else {\n return \"BokehJS failed to load.\";\n }\n } else {\n const prefix = `BokehJS ${root.Bokeh.version}`;\n if (error == null) {\n return `${prefix} successfully loaded.`;\n } else {\n return `${prefix} encountered errors while loading and may not function as expected.`;\n }\n }\n })();\n el.innerHTML = html;\n\n if (error != null) {\n const wrapper = document.createElement(\"div\");\n wrapper.style.overflow = \"auto\";\n wrapper.style.height = \"5em\";\n wrapper.style.resize = \"vertical\";\n const content = document.createElement(\"div\");\n content.style.fontFamily = \"monospace\";\n content.style.whiteSpace = \"pre-wrap\";\n content.style.backgroundColor = \"rgb(255, 221, 221)\";\n content.textContent = error.stack ?? error.toString();\n wrapper.append(content);\n el.append(wrapper);\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(() => display_loaded(error), 100);\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n 
root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.6.3.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n try {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n\n } catch (error) {throw error;\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if 
(!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(null)).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));", + "application/vnd.bokehjs_load.v0+json": "" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MalariaGEN Ag3 API client
\n", + " Please note that data are subject to terms of use,\n", + " for more information see \n", + " the MalariaGEN website or contact support@malariagen.net.\n", + " See also the Ag3 API docs.\n", + "
\n", + " Storage URL\n", + " simplecache::gs://vo_agam_release_master_us_central1
\n", + " Data releases available\n", + " 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10, 3.11, 3.12, 3.13, 3.14, 3.15, 3.16
\n", + " Results cache\n", + " /home/sanj/Software/malariagen-data-python/notebooks/results_cache
\n", + " Cohorts analysis\n", + " 20250502
\n", + " AIM analysis\n", + " 20220528
\n", + " Site filters analysis\n", + " dt_20200416
\n", + " Software version\n", + " malariagen_data 15.4.0.post6+f591dba1
\n", + " Client location\n", + " England, United Kingdom
\n", + " " + ], + "text/plain": [ + "\n", + "Storage URL : simplecache::gs://vo_agam_release_master_us_central1\n", + "Data releases available : 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10, 3.11, 3.12, 3.13, 3.14, 3.15, 3.16\n", + "Results cache : /home/sanj/Software/malariagen-data-python/notebooks/results_cache\n", + "Cohorts analysis : 20250502\n", + "AIM analysis : 20220528\n", + "Site filters analysis : dt_20200416\n", + "Software version : malariagen_data 15.4.0.post6+f591dba1\n", + "Client location : England, United Kingdom\n", + "---\n", + "Please note that data are subject to terms of use,\n", + "for more information see https://www.malariagen.net/data\n", + "or contact support@malariagen.net. For API documentation see \n", + "https://malariagen.github.io/malariagen-data-python/v15.4.0.post6+f591dba1/Ag3.html" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ag3 = malariagen_data.Ag3(\n", " \"simplecache::gs://vo_agam_release_master_us_central1\",\n", @@ -327,6 +426,86 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": 5, + "id": "51846e18", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4daea74540a74823be0621dcc0afd499", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Load haplotypes: 0%| | 0/67 [00:00= 12060. Found TBB_INTERFACE_VERSION = 12050. 
The TBB threading layer is disabled.\u001b[0m\n", + " warnings.warn(problem)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "ename": "AttributeError", + "evalue": "'DataFrame' object has no attribute 'label'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", + "\u001b[32m/tmp/ipykernel_537773/530656911.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m ag3.plot_haplotype_clustering_advanced(\n\u001b[32m 2\u001b[39m region=\u001b[33m\"2R:28,480,000-28,490,000\"\u001b[39m,\n\u001b[32m 3\u001b[39m sample_sets=[\u001b[33m\"3.0\"\u001b[39m],\n\u001b[32m 4\u001b[39m sample_query=\u001b[33m\"taxon == 'arabiensis'\"\u001b[39m,\n", + "\u001b[32m~/Software/malariagen-data-python/malariagen_data/anoph/hapclust.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, region, analysis, snp_transcript, snp_colorscale, snp_filter_min_maf, snp_query, sample_sets, sample_query, sample_query_options, random_seed, cohort_size, cut_height, min_cluster_size, color, symbol, linkage_method, count_sort, distance_sort, title, title_font_size, width, dendrogram_height, snp_row_height, show, renderer, render_mode, leaf_y, marker_size, line_width, line_color, color_discrete_sequence, color_discrete_map, category_orders, legend_sizing, chunks, inline_array)\u001b[39m\n\u001b[32m 415\u001b[39m )\n\u001b[32m 416\u001b[39m figures.append(snp_trace)\n\u001b[32m 417\u001b[39m subplot_heights.append(\u001b[32m25\u001b[39m)\n\u001b[32m 418\u001b[39m \n\u001b[32m--> \u001b[39m\u001b[32m419\u001b[39m figures, subplot_heights, df_haps = self._insert_hapclust_snp_trace(\n\u001b[32m 420\u001b[39m transcript=snp_transcript,\n\u001b[32m 421\u001b[39m snp_query=snp_query,\n\u001b[32m 422\u001b[39m figures=figures,\n", + 
"\u001b[32m~/Software/malariagen-data-python/malariagen_data/anoph/hapclust.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, figures, subplot_heights, transcript, analysis, dendro_sample_id_order, snp_filter_min_maf, snp_colorscale, snp_query, snp_row_height, sample_sets, sample_query, chunks, inline_array)\u001b[39m\n\u001b[32m 561\u001b[39m ):\n\u001b[32m 562\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m plotly \u001b[38;5;28;01mimport\u001b[39;00m graph_objects \u001b[38;5;28;01mas\u001b[39;00m go\n\u001b[32m 563\u001b[39m \n\u001b[32m 564\u001b[39m \u001b[38;5;66;03m# load genotype allele counts at SNP variants for each sample\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m565\u001b[39m df_haps = self.transcript_haplotypes(\n\u001b[32m 566\u001b[39m transcript=transcript,\n\u001b[32m 567\u001b[39m snp_query=snp_query,\n\u001b[32m 568\u001b[39m sample_query=sample_query,\n", + "\u001b[32m~/Software/malariagen-data-python/malariagen_data/anoph/hapclust.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, transcript, sample_sets, sample_query, analysis, snp_query, chunks, inline_array)\u001b[39m\n\u001b[32m 529\u001b[39m sample_ids = ds_haps[\u001b[33m\"sample_id\"\u001b[39m].values\n\u001b[32m 530\u001b[39m \n\u001b[32m 531\u001b[39m \u001b[38;5;66;03m# Filter df_eff to haplotypes, and filter haplotypes to SNPs present in df_eff\u001b[39;00m\n\u001b[32m 532\u001b[39m df_eff = df_eff.query(\u001b[33m\"pos_alt in @h_pos_alts\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m533\u001b[39m label = df_eff.label.values\n\u001b[32m 534\u001b[39m haps_bool = np.isin(h_pos_alts, df_eff.pos_alt)\n\u001b[32m 535\u001b[39m haps = haps.compress(haps_bool)\n\u001b[32m 536\u001b[39m \n", + "\u001b[32m~/apps/miniforge3/lib/python3.12/site-packages/pandas/core/generic.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, name)\u001b[39m\n\u001b[32m 6295\u001b[39m \u001b[38;5;28;01mand\u001b[39;00m name \u001b[38;5;28;01mnot\u001b[39;00m \u001b[38;5;28;01min\u001b[39;00m 
self._accessors\n\u001b[32m 6296\u001b[39m \u001b[38;5;28;01mand\u001b[39;00m self._info_axis._can_hold_identifiers_and_holds_name(name)\n\u001b[32m 6297\u001b[39m ):\n\u001b[32m 6298\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m self[name]\n\u001b[32m-> \u001b[39m\u001b[32m6299\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m object.__getattribute__(self, name)\n", + "\u001b[31mAttributeError\u001b[39m: 'DataFrame' object has no attribute 'label'" + ] + } + ], + "source": [ + "ag3.plot_haplotype_clustering_advanced(\n", + " region=\"2R:28,480,000-28,490,000\",\n", + " sample_sets=[\"3.0\"],\n", + " sample_query=\"taxon == 'arabiensis'\",\n", + " analysis=\"gamb_colu_arab\",\n", + " # symbol=new_cohorts,\n", + " color=\"year\",\n", + " cohort_size=None,\n", + " snp_transcript=\"AGAP002862-RA\",\n", + " # width=1000,\n", + " # height=400,\n", + ")" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -421,7 +600,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -435,12 +614,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "vscode": { - "interpreter": { - "hash": "3b9ddb1005cd06989fd869b9e3d566470f1be01faa610bb17d64e58e32302e8b" - } + "version": "3.12.8" }, "widgets": { "application/vnd.jupyter.widget-state+json": { From 510b639724bed155a63cff1b9770d313eb911060 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Sun, 7 Sep 2025 16:18:58 +0100 Subject: [PATCH 04/15] fixes for hap data --- malariagen_data/anoph/hapclust.py | 83 ++++++--- malariagen_data/anoph/hapclust_params.py | 12 ++ notebooks/plot_haplotype_clustering.ipynb | 197 +++------------------- 3 files changed, 93 insertions(+), 199 deletions(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 10194b29a..1a6532ee6 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py 
@@ -48,6 +48,7 @@ def plot_haplotype_clustering( color: plotly_params.color = None, symbol: plotly_params.symbol = None, linkage_method: hapclust_params.linkage_method = hapclust_params.linkage_method_default, + distance_metric: hapclust_params.distance_metric = hapclust_params.distance_metric_default, count_sort: Optional[tree_params.count_sort] = None, distance_sort: Optional[tree_params.distance_sort] = None, title: plotly_params.title = True, @@ -90,6 +91,7 @@ def plot_haplotype_clustering( dist, phased_samples, n_snps_used = self.haplotype_pairwise_distances( region=region, analysis=analysis, + distance_metric=distance_metric, sample_sets=sample_sets, sample_query=sample_query, sample_query_options=sample_query_options, @@ -168,7 +170,7 @@ def plot_haplotype_clustering( leaf_color_discrete_map=color_discrete_map_prepped, leaf_category_orders=category_orders_prepped, template="simple_white", - y_axis_title="Distance (no. SNPs)", + y_axis_title=f"Distance ({distance_metric})", y_axis_buffer=1, ) @@ -205,6 +207,7 @@ def plot_haplotype_clustering( def haplotype_pairwise_distances( self, region: base_params.regions, + distance_metric: hapclust_params.distance_metric = hapclust_params.distance_metric_default, analysis: hap_params.analysis = base_params.DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, sample_query: Optional[base_params.sample_query] = None, @@ -223,6 +226,7 @@ def haplotype_pairwise_distances( region_prepped = self._prep_region_cache_param(region=region) params = dict( region=region_prepped, + distance_metric=distance_metric, analysis=analysis, sample_sets=sample_sets_prepped, sample_query=sample_query, @@ -252,6 +256,7 @@ def _haplotype_pairwise_distances( self, *, region, + distance_metric, analysis, sample_sets, sample_query, @@ -295,10 +300,27 @@ def _haplotype_pairwise_distances( # to allow these to be saved to the results cache. 
phased_samples = ds_haps["sample_id"].values.astype("U") + # Number of sites + n_total_sites = region["end"] - region["start"] + 1 + + # Adjust distances if dxy requested + if distance_metric == "dxy": + # Normalize by total sites (common definition of dxy) + dist = dist / n_total_sites + elif distance_metric == "hamming": + # Leave as raw SNP differences + pass + else: + raise ValueError( + f"Unsupported distance_metric: {distance_metric}. " + "Choose from {'hamming', 'dxy'}." + ) + return dict( dist=dist, phased_samples=phased_samples, - n_snps=np.array(ht.shape[0]), + n_snps=np.array(n_total_sites), + n_seg_sites=np.array(ht.shape[0]), ) def plot_haplotype_clustering_advanced( @@ -314,8 +336,10 @@ def plot_haplotype_clustering_advanced( sample_query_options=None, random_seed=42, cohort_size=None, - cut_height=None, + distance_metric="hamming", + cluster_threshold=None, min_cluster_size=None, + cluster_criterion="distance", color=None, symbol=None, linkage_method="complete", @@ -358,6 +382,7 @@ def plot_haplotype_clustering_advanced( count_sort=count_sort, cohort_size=cohort_size, distance_sort=distance_sort, + distance_metric=distance_metric, linkage_method=linkage_method, color=color, symbol=symbol, @@ -389,23 +414,27 @@ def plot_haplotype_clustering_advanced( figures = [fig_dendro] subplot_heights = [dendrogram_height] - if cut_height and min_cluster_size: - cluster_col_list = px.colors.sequential.Turbo.copy() - cluster_col_list.insert(0, "white") - + if cluster_threshold and min_cluster_size: df_clusters = self.cut_dist_tree( dist=res["dist"], dist_samples=_make_unique(np.repeat(res["dist_samples"], 2)), dendro_sample_id_order=dendro_sample_id_order, linkage_method=linkage_method, - cut_height=cut_height, + cluster_threshold=cluster_threshold, min_cluster_size=min_cluster_size, - count_sort=count_sort, - distance_sort=distance_sort, + cluster_criterion=cluster_criterion, ) leaf_data = leaf_data.merge(df_clusters.T.reset_index()) + # if more than 8 clusters, 
use px.colors.qualitative.Alphabet + if df_clusters.max().max() > 8: + cluster_col_list = px.colors.qualitative.Alphabet.copy() + cluster_col_list.insert(0, "white") + else: + cluster_col_list = px.colors.qualitative.Dark2.copy() + cluster_col_list.insert(0, "white") + snp_trace = go.Heatmap( z=df_clusters.values, y=df_clusters.index.to_list(), @@ -455,7 +484,7 @@ def plot_haplotype_clustering_advanced( legend=dict(itemsizing=legend_sizing, tracegroupgap=0), ) - if snp_transcript: + if snp_transcript and n_aa > 0: # add lines to aa plot aa_idx = len(figures) fig.add_hline(y=-0.5, line_width=1, line_color="grey", row=aa_idx, col=1) @@ -502,9 +531,13 @@ def transcript_haplotypes( """ # Get SNP genotype allele counts for the transcript, applying snp_query - df_eff = self.snp_effects( - transcript=transcript, - ).query(snp_query) + df_eff = ( + self.snp_effects( + transcript=transcript, + ) + .query(snp_query) + .reset_index(drop=True) + ) df_eff["label"] = pandas_apply( _make_snp_label_effect, @@ -611,10 +644,9 @@ def cut_dist_tree( dist_samples, dendro_sample_id_order, linkage_method, - cut_height, + cluster_threshold, + cluster_criterion, min_cluster_size, - count_sort, - distance_sort, ): """ Create a one-row DataFrame with haplotype_ids as columns and cluster assignments as values @@ -627,14 +659,17 @@ def cut_dist_tree( List/array of individual identifiers (haplotype_ids) linkage_method : str Method used to calculate the linkage matrix - cut_height : float + cluster_threshold : float Height at which to cut the dendrogram min_cluster_size : int, default=1 Minimum number of individuals required in a cluster to be included - count_sort : bool - Whether to sort clusters by count - distance_sort : bool - Whether to sort clusters by distance + dendro_sample_id_order : array-like + List/array of individual identifiers (haplotype_ids) in the order they appear in + the dendrogram + cluster_criterion : str, default='distance' + The cluster_criterion to use in forming flat 
clusters. One of +        'inconsistent', 'distance', 'maxclust', 'maxclust_monocrit', 'monocrit' +        See scipy.cluster.hierarchy.fcluster for details.          Returns:         --------     @@ -646,7 +681,9 @@ def cut_dist_tree(         Z = linkage(dist, method=linkage_method)           # Get cluster assignments for each individual -    cluster_assignments = fcluster(Z, cut_height, criterion="distance") +    cluster_assignments = fcluster( +        Z, t=cluster_threshold, criterion=cluster_criterion +    )           # Create initial DataFrame         df = ( diff --git a/malariagen_data/anoph/hapclust_params.py b/malariagen_data/anoph/hapclust_params.py index 4f53fc823..08bc67351 100644 --- a/malariagen_data/anoph/hapclust_params.py +++ b/malariagen_data/anoph/hapclust_params.py @@ -1,5 +1,17 @@ """Parameters for haplotype clustering functions."""   from .clustering_params import linkage_method +from typing_extensions import Annotated, TypeAlias, Literal   linkage_method_default: linkage_method = "single" + +distance_metric: TypeAlias = Annotated[ +    Literal["hamming", "dxy"], +    """ +    The distance metric to use for calculating pairwise distances between haplotypes. +    'hamming' computes the Hamming distance (number of differing SNPs) between haplotypes. +    'dxy' computes the average number of nucleotide differences per site between haplotypes. 
+ """, +] + +distance_metric_default: str = "hamming" diff --git a/notebooks/plot_haplotype_clustering.ipynb b/notebooks/plot_haplotype_clustering.ipynb index fd344a91e..f035681b9 100644 --- a/notebooks/plot_haplotype_clustering.ipynb +++ b/notebooks/plot_haplotype_clustering.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d3f73666", "metadata": {}, "outputs": [], @@ -21,109 +21,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "38abbb28", "metadata": {}, - "outputs": [ - { - "data": { - "application/javascript": "'use strict';\n(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\nconst JS_MIME_TYPE = 'application/javascript';\n const HTML_MIME_TYPE = 'text/html';\n const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n const CLASS_NAME = 'output_bokeh rendered_html';\n\n /**\n * Render data to the DOM node\n */\n function render(props, node) {\n const script = document.createElement(\"script\");\n node.appendChild(script);\n }\n\n /**\n * Handle when an output is cleared or removed\n */\n function handleClearOutput(event, handle) {\n function drop(id) {\n const view = Bokeh.index.get_by_id(id)\n if (view != null) {\n view.model.document.clear()\n Bokeh.index.delete(view)\n }\n }\n\n const cell = handle.cell;\n\n const id = cell.output_area._bokeh_element_id;\n const server_id = cell.output_area._bokeh_server_id;\n\n // Clean up Bokeh references\n if (id != null) {\n drop(id)\n }\n\n if (server_id !== undefined) {\n // Clean up Bokeh references\n const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n cell.notebook.kernel.execute(cmd_clean, {\n iopub: {\n 
output: function(msg) {\n const id = msg.content.text.trim()\n drop(id)\n }\n }\n });\n // Destroy server and session\n const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n cell.notebook.kernel.execute(cmd_destroy);\n }\n }\n\n /**\n * Handle when a new output is added\n */\n function handleAddOutput(event, handle) {\n const output_area = handle.output_area;\n const output = handle.output;\n\n // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n return\n }\n\n const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n\n if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n // store reference to embed id on output_area\n output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n }\n if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n const bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n const script_attrs = bk_div.children[0].attributes;\n for (let i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n }\n\n function register_renderer(events, OutputArea) {\n\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n const toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n 
render(props, toinsert[toinsert.length - 1]);\n element.append(toinsert);\n return toinsert\n }\n\n /* Handle when an output is cleared or removed */\n events.on('clear_output.CodeCell', handleClearOutput);\n events.on('delete.Cell', handleClearOutput);\n\n /* Handle when a new output is added */\n events.on('output_added.OutputArea', handleAddOutput);\n\n /**\n * Register the mime type and append_mime function with output_area\n */\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n /* Is output safe? */\n safe: true,\n /* Index of renderer in `output_area.display_order` */\n index: 0\n });\n }\n\n // register the mime type if in Jupyter Notebook environment and previously unregistered\n if (root.Jupyter !== undefined) {\n const events = require('base/js/events');\n const OutputArea = require('notebook/js/outputarea').OutputArea;\n\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n }\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded(error = null) {\n const el = document.getElementById(null);\n if (el != null) {\n const html = (() => {\n if (typeof root.Bokeh === \"undefined\") {\n if (error == null) {\n return \"BokehJS is loading ...\";\n } else {\n return \"BokehJS failed to load.\";\n }\n } else {\n const prefix = `BokehJS ${root.Bokeh.version}`;\n if (error == null) {\n return `${prefix} successfully loaded.`;\n } else {\n return `${prefix} encountered errors while loading and may not function as expected.`;\n }\n }\n })();\n el.innerHTML = html;\n\n if (error != null) {\n const wrapper = document.createElement(\"div\");\n wrapper.style.overflow = \"auto\";\n wrapper.style.height = \"5em\";\n wrapper.style.resize = \"vertical\";\n const content = document.createElement(\"div\");\n content.style.fontFamily = \"monospace\";\n content.style.whiteSpace = \"pre-wrap\";\n content.style.backgroundColor = \"rgb(255, 221, 221)\";\n content.textContent = error.stack ?? error.toString();\n wrapper.append(content);\n el.append(wrapper);\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(() => display_loaded(error), 100);\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n 
root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.6.3.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n try {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n\n } catch (error) {throw error;\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if 
(!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(null)).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));", - "application/vnd.bokehjs_load.v0+json": "" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MalariaGEN Ag3 API client
\n", - " Please note that data are subject to terms of use,\n", - " for more information see \n", - " the MalariaGEN website or contact support@malariagen.net.\n", - " See also the Ag3 API docs.\n", - "
\n", - " Storage URL\n", - " simplecache::gs://vo_agam_release_master_us_central1
\n", - " Data releases available\n", - " 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10, 3.11, 3.12, 3.13, 3.14, 3.15, 3.16
\n", - " Results cache\n", - " /home/sanj/Software/malariagen-data-python/notebooks/results_cache
\n", - " Cohorts analysis\n", - " 20250502
\n", - " AIM analysis\n", - " 20220528
\n", - " Site filters analysis\n", - " dt_20200416
\n", - " Software version\n", - " malariagen_data 15.4.0.post6+f591dba1
\n", - " Client location\n", - " England, United Kingdom
\n", - " " - ], - "text/plain": [ - "\n", - "Storage URL : simplecache::gs://vo_agam_release_master_us_central1\n", - "Data releases available : 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10, 3.11, 3.12, 3.13, 3.14, 3.15, 3.16\n", - "Results cache : /home/sanj/Software/malariagen-data-python/notebooks/results_cache\n", - "Cohorts analysis : 20250502\n", - "AIM analysis : 20220528\n", - "Site filters analysis : dt_20200416\n", - "Software version : malariagen_data 15.4.0.post6+f591dba1\n", - "Client location : England, United Kingdom\n", - "---\n", - "Please note that data are subject to terms of use,\n", - "for more information see https://www.malariagen.net/data\n", - "or contact support@malariagen.net. For API documentation see \n", - "https://malariagen.github.io/malariagen-data-python/v15.4.0.post6+f591dba1/Ag3.html" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "ag3 = malariagen_data.Ag3(\n", " \"simplecache::gs://vo_agam_release_master_us_central1\",\n", @@ -413,7 +314,7 @@ "metadata": {}, "outputs": [], "source": [ - "ag3.plot_haplotype_clustering(\n", + "ag3.plot_haplotype_clustering_advanced(\n", " region=\"2R:28,480,000-28,490,000\",\n", " sample_sets=[\"3.0\"],\n", " sample_query=\"taxon == 'arabiensis'\",\n", @@ -421,88 +322,32 @@ " symbol=new_cohorts,\n", " color=\"year\",\n", " cohort_size=None,\n", - " width=1000,\n", - " height=400,\n", + " snp_transcript=\"AGAP002863-RA\",\n", + " distance_metric='hamming',\n", + " snp_filter_min_maf=0.05\n", ")" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "51846e18", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4daea74540a74823be0621dcc0afd499", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Load haplotypes: 0%| | 0/67 
[00:00= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.\u001b[0m\n", - " warnings.warn(problem)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "ename": "AttributeError", - "evalue": "'DataFrame' object has no attribute 'label'", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", - "\u001b[32m/tmp/ipykernel_537773/530656911.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m ag3.plot_haplotype_clustering_advanced(\n\u001b[32m 2\u001b[39m region=\u001b[33m\"2R:28,480,000-28,490,000\"\u001b[39m,\n\u001b[32m 3\u001b[39m sample_sets=[\u001b[33m\"3.0\"\u001b[39m],\n\u001b[32m 4\u001b[39m sample_query=\u001b[33m\"taxon == 'arabiensis'\"\u001b[39m,\n", - "\u001b[32m~/Software/malariagen-data-python/malariagen_data/anoph/hapclust.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, region, analysis, snp_transcript, snp_colorscale, snp_filter_min_maf, snp_query, sample_sets, sample_query, sample_query_options, random_seed, cohort_size, cut_height, min_cluster_size, color, symbol, linkage_method, count_sort, distance_sort, title, title_font_size, width, dendrogram_height, snp_row_height, show, renderer, render_mode, leaf_y, marker_size, line_width, line_color, color_discrete_sequence, color_discrete_map, category_orders, legend_sizing, chunks, inline_array)\u001b[39m\n\u001b[32m 415\u001b[39m )\n\u001b[32m 416\u001b[39m figures.append(snp_trace)\n\u001b[32m 417\u001b[39m subplot_heights.append(\u001b[32m25\u001b[39m)\n\u001b[32m 418\u001b[39m \n\u001b[32m--> \u001b[39m\u001b[32m419\u001b[39m figures, subplot_heights, df_haps = self._insert_hapclust_snp_trace(\n\u001b[32m 420\u001b[39m transcript=snp_transcript,\n\u001b[32m 421\u001b[39m snp_query=snp_query,\n\u001b[32m 
422\u001b[39m figures=figures,\n", - "\u001b[32m~/Software/malariagen-data-python/malariagen_data/anoph/hapclust.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, figures, subplot_heights, transcript, analysis, dendro_sample_id_order, snp_filter_min_maf, snp_colorscale, snp_query, snp_row_height, sample_sets, sample_query, chunks, inline_array)\u001b[39m\n\u001b[32m 561\u001b[39m ):\n\u001b[32m 562\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m plotly \u001b[38;5;28;01mimport\u001b[39;00m graph_objects \u001b[38;5;28;01mas\u001b[39;00m go\n\u001b[32m 563\u001b[39m \n\u001b[32m 564\u001b[39m \u001b[38;5;66;03m# load genotype allele counts at SNP variants for each sample\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m565\u001b[39m df_haps = self.transcript_haplotypes(\n\u001b[32m 566\u001b[39m transcript=transcript,\n\u001b[32m 567\u001b[39m snp_query=snp_query,\n\u001b[32m 568\u001b[39m sample_query=sample_query,\n", - "\u001b[32m~/Software/malariagen-data-python/malariagen_data/anoph/hapclust.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, transcript, sample_sets, sample_query, analysis, snp_query, chunks, inline_array)\u001b[39m\n\u001b[32m 529\u001b[39m sample_ids = ds_haps[\u001b[33m\"sample_id\"\u001b[39m].values\n\u001b[32m 530\u001b[39m \n\u001b[32m 531\u001b[39m \u001b[38;5;66;03m# Filter df_eff to haplotypes, and filter haplotypes to SNPs present in df_eff\u001b[39;00m\n\u001b[32m 532\u001b[39m df_eff = df_eff.query(\u001b[33m\"pos_alt in @h_pos_alts\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m533\u001b[39m label = df_eff.label.values\n\u001b[32m 534\u001b[39m haps_bool = np.isin(h_pos_alts, df_eff.pos_alt)\n\u001b[32m 535\u001b[39m haps = haps.compress(haps_bool)\n\u001b[32m 536\u001b[39m \n", - "\u001b[32m~/apps/miniforge3/lib/python3.12/site-packages/pandas/core/generic.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, name)\u001b[39m\n\u001b[32m 6295\u001b[39m \u001b[38;5;28;01mand\u001b[39;00m name \u001b[38;5;28;01mnot\u001b[39;00m 
\u001b[38;5;28;01min\u001b[39;00m self._accessors\n\u001b[32m 6296\u001b[39m \u001b[38;5;28;01mand\u001b[39;00m self._info_axis._can_hold_identifiers_and_holds_name(name)\n\u001b[32m 6297\u001b[39m ):\n\u001b[32m 6298\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m self[name]\n\u001b[32m-> \u001b[39m\u001b[32m6299\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m object.__getattribute__(self, name)\n", - "\u001b[31mAttributeError\u001b[39m: 'DataFrame' object has no attribute 'label'" - ] - } - ], + "outputs": [], "source": [ "ag3.plot_haplotype_clustering_advanced(\n", - " region=\"2R:28,480,000-28,490,000\",\n", - " sample_sets=[\"3.0\"],\n", - " sample_query=\"taxon == 'arabiensis'\",\n", - " analysis=\"gamb_colu_arab\",\n", - " # symbol=new_cohorts,\n", - " color=\"year\",\n", + " region=\"2L:2,360,000-2,431,000\",\n", + " sample_sets=[\"AG1000G-GH\"],\n", + " analysis=\"gamb_colu\",\n", + " color=\"taxon\",\n", " cohort_size=None,\n", - " snp_transcript=\"AGAP002862-RA\",\n", - " # width=1000,\n", - " # height=400,\n", + " snp_transcript=\"AGAP004707-RD\",\n", + " distance_metric='dxy',\n", + " cluster_criterion='distance',\n", + " cluster_threshold=0.0007,\n", + " min_cluster_size=5,\n", + " snp_filter_min_maf=0.05,\n", + " show=True\n", ")" ] }, @@ -600,7 +445,7 @@ ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, From e7adb65e222575ab7b9ac1279132fcc1a7124ca0 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Sun, 7 Sep 2025 16:34:25 +0100 Subject: [PATCH 05/15] mypy fix --- malariagen_data/anoph/hapclust_params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/anoph/hapclust_params.py b/malariagen_data/anoph/hapclust_params.py index 08bc67351..71c5c2b4f 100644 --- a/malariagen_data/anoph/hapclust_params.py +++ b/malariagen_data/anoph/hapclust_params.py @@ -14,4 +14,4 @@ """, ] -distance_metric_default: str = "hamming" 
+distance_metric_default: Literal["hamming", "dxy"] = "hamming" From 9741b730e00e03dbc213ef37d3cf74f9caa3620b Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Sun, 7 Sep 2025 16:52:05 +0100 Subject: [PATCH 06/15] tests fix hapclust output type --- malariagen_data/anoph/hapclust.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 1a6532ee6..c90268c01 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -68,7 +68,7 @@ def plot_haplotype_clustering( legend_sizing: plotly_params.legend_sizing = "constant", chunks: base_params.chunks = base_params.native_chunks, inline_array: base_params.inline_array = base_params.inline_array_default, - ) -> plotly_params.figure: + ) -> Optional[dict]: import sys # Normalise params. From 50a8c228f60b4eb2281f3fd880d330faef2ef75e Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 10:33:13 +0100 Subject: [PATCH 07/15] add docstring and typing --- malariagen_data/anoph/dipclust.py | 2 + malariagen_data/anoph/hapclust.py | 92 ++++++++++++++++++------------- 2 files changed, 56 insertions(+), 38 deletions(-) diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py index 043019ec4..82d99b758 100644 --- a/malariagen_data/anoph/dipclust.py +++ b/malariagen_data/anoph/dipclust.py @@ -742,6 +742,7 @@ def plot_diplotype_clustering_advanced( dendro_sample_id_order=dendro_sample_id_order, snp_filter_min_maf=snp_filter_min_maf, snp_colorscale=snp_colorscale, + snp_row_height=snp_row_height, chunks=chunks, inline_array=inline_array, ) @@ -759,6 +760,7 @@ def plot_diplotype_clustering_advanced( dendro_sample_id_order=dendro_sample_id_order, snp_filter_min_maf=snp_filter_min_maf, snp_colorscale=snp_colorscale, + snp_row_height=snp_row_height, chunks=chunks, inline_array=inline_array, ) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 
c90268c01..5000cd60e 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -14,6 +14,7 @@ hap_params, clustering_params, hapclust_params, + dipclust_params, ) from .snp_data import AnophelesSnpData from .snp_frq import AA_CHANGE_QUERY, _make_snp_label_effect @@ -323,46 +324,60 @@ def _haplotype_pairwise_distances( n_seg_sites=np.array(ht.shape[0]), ) + @check_types + @doc( + summary=""" + Hierarchically cluster haplotypes in region, and produce an interactive plot + with optional SNP haplotype heatmap and/or cluster assignments. + """, + returns=""" + If `show` is False, returns a tuple (fig, leaf_data, df_haps) where + `fig` is a plotly Figure object, `leaf_data` is a DataFrame with + metadata for each haplotype in the dendrogram, and `df_haps` is a DataFrame + of haplotype calls for each sample at each SNP in the specified transcript. + If `show` is True, displays the figure and returns None. + """, + ) def plot_haplotype_clustering_advanced( self, - region, - analysis, - snp_transcript=None, - snp_colorscale="Greys", - snp_filter_min_maf=0.05, + region: base_params.regions, + analysis: hap_params.analysis = base_params.DEFAULT, + snp_transcript: dipclust_params.snp_transcript = None, + snp_colorscale: dipclust_params.snp_colorscale = "Greys", + snp_filter_min_maf: dipclust_params.snp_filter_min_maf = 0.05, snp_query=AA_CHANGE_QUERY, - sample_sets=None, - sample_query=None, - sample_query_options=None, - random_seed=42, - cohort_size=None, - distance_metric="hamming", - cluster_threshold=None, - min_cluster_size=None, + sample_sets: Optional[base_params.sample_sets] = None, + sample_query: Optional[base_params.sample_query] = None, + sample_query_options: Optional[base_params.sample_query_options] = None, + random_seed: base_params.random_seed = 42, + cohort_size: Optional[base_params.cohort_size] = None, + distance_metric: hapclust_params.distance_metric = hapclust_params.distance_metric_default, + cluster_threshold: 
Optional[float] = None, + min_cluster_size: Optional[int] = None, cluster_criterion="distance", - color=None, - symbol=None, - linkage_method="complete", - count_sort=None, - distance_sort=None, - title=True, - title_font_size=14, - width=None, - dendrogram_height=300, - snp_row_height=25, - show=True, - renderer=None, - render_mode="svg", - leaf_y=0, - marker_size=5, - line_width=0.5, - line_color="black", - color_discrete_sequence=None, - color_discrete_map=None, - category_orders=None, - legend_sizing="constant", - chunks="native", - inline_array=True, + color: plotly_params.color = None, + symbol: plotly_params.symbol = None, + linkage_method: dipclust_params.linkage_method = "complete", + count_sort: Optional[tree_params.count_sort] = None, + distance_sort: Optional[tree_params.distance_sort] = None, + title: plotly_params.title = True, + title_font_size: plotly_params.title_font_size = 14, + width: plotly_params.fig_width = None, + dendrogram_height: plotly_params.height = 300, + snp_row_height: plotly_params.height = 25, + show: plotly_params.show = True, + renderer: plotly_params.renderer = None, + render_mode: plotly_params.render_mode = "svg", + leaf_y: plotly_params.leaf_y = 0, + marker_size: plotly_params.marker_size = 5, + line_width: plotly_params.line_width = 0.5, + line_color: plotly_params.line_color = "black", + color_discrete_sequence: plotly_params.color_discrete_sequence = None, + color_discrete_map: plotly_params.color_discrete_map = None, + category_orders: plotly_params.category_order = None, + legend_sizing: plotly_params.legend_sizing = "constant", + chunks: base_params.chunks = base_params.native_chunks, + inline_array: base_params.inline_array = base_params.inline_array_default, ): import plotly.express as px import plotly.graph_objects as go @@ -565,7 +580,6 @@ def transcript_haplotypes( h_pos = allel.SortedIndex(ds_haps["variant_position"].values) h_alts = ds_haps["variant_allele"].values.astype(str)[:, 1] h_pos_alts = 
np.array([f"{pos}-{h_alts[i]}" for i, pos in enumerate(h_pos)]) - sample_ids = ds_haps["sample_id"].values # Filter df_eff to haplotypes, and filter haplotypes to SNPs present in df_eff df_eff = df_eff.query("pos_alt in @h_pos_alts") @@ -577,7 +591,9 @@ def transcript_haplotypes( df_haps = pd.DataFrame( haps, index=label, - columns=_make_unique(np.repeat(sample_ids, 2)), # two haplotypes per sample + columns=_make_unique( + np.repeat(ds_haps["sample_id"].values, 2) + ), # two haplotypes per sample ) return df_haps From 69359669da3a4e59fe628c3b71aeb7ddb18d515c Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 10:35:58 +0100 Subject: [PATCH 08/15] fix snp_colorscale type --- malariagen_data/anoph/hapclust.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 5000cd60e..8f69d870e 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -343,7 +343,7 @@ def plot_haplotype_clustering_advanced( region: base_params.regions, analysis: hap_params.analysis = base_params.DEFAULT, snp_transcript: dipclust_params.snp_transcript = None, - snp_colorscale: dipclust_params.snp_colorscale = "Greys", + snp_colorscale: plotly_params.snp_colorscale = "Greys", snp_filter_min_maf: dipclust_params.snp_filter_min_maf = 0.05, snp_query=AA_CHANGE_QUERY, sample_sets: Optional[base_params.sample_sets] = None, From 14b58462be2f8f77b9216a80ed0a7c8d65e695bb Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 10:42:28 +0100 Subject: [PATCH 09/15] fix errors --- malariagen_data/anoph/hapclust.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 8f69d870e..a172bba62 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -19,9 +19,12 @@ from .snp_data import AnophelesSnpData from .snp_frq import 
AA_CHANGE_QUERY, _make_snp_label_effect from .hap_data import AnophelesHapData +from .dipclust import AnophelesDipClustAnalysis -class AnophelesHapClustAnalysis(AnophelesHapData, AnophelesSnpData): +class AnophelesHapClustAnalysis( + AnophelesHapData, AnophelesSnpData, AnophelesDipClustAnalysis +): def __init__( self, **kwargs, @@ -342,9 +345,9 @@ def plot_haplotype_clustering_advanced( self, region: base_params.regions, analysis: hap_params.analysis = base_params.DEFAULT, - snp_transcript: dipclust_params.snp_transcript = None, - snp_colorscale: plotly_params.snp_colorscale = "Greys", - snp_filter_min_maf: dipclust_params.snp_filter_min_maf = 0.05, + snp_transcript: Optional[dipclust_params.snp_transcript] = None, + snp_colorscale: Optional[plotly_params.color_continuous_scale] = "Greys", + snp_filter_min_maf: float = 0.05, snp_query=AA_CHANGE_QUERY, sample_sets: Optional[base_params.sample_sets] = None, sample_query: Optional[base_params.sample_query] = None, @@ -368,7 +371,7 @@ def plot_haplotype_clustering_advanced( show: plotly_params.show = True, renderer: plotly_params.renderer = None, render_mode: plotly_params.render_mode = "svg", - leaf_y: plotly_params.leaf_y = 0, + leaf_y: clustering_params.leaf_y = 0, marker_size: plotly_params.marker_size = 5, line_width: plotly_params.line_width = 0.5, line_color: plotly_params.line_color = "black", From a878727c8a2fae4e47c83b54d5437d19a390f824 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 10:47:48 +0100 Subject: [PATCH 10/15] fix doc --- malariagen_data/anoph/hapclust.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index a172bba62..7f0c8c9ce 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -340,6 +340,10 @@ def _haplotype_pairwise_distances( of haplotype calls for each sample at each SNP in the specified transcript. If `show` is True, displays the figure and returns None. 
""", + parameters=dict( + snp_transcript="Plot amino acid variants for these transcripts.", + snp_filter_min_maf="Filter amino acid variants with alternate allele frequency below this threshold.", + ), ) def plot_haplotype_clustering_advanced( self, From 36f6d5156fd4941bd59b03ce455599de8f36464b Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 11:02:24 +0100 Subject: [PATCH 11/15] move concat_subplots outside of dipclust.py to plotly_dendrogram.py --- malariagen_data/anoph/dipclust.py | 54 ++-------------------------- malariagen_data/anoph/hapclust.py | 15 ++++---- malariagen_data/plotly_dendrogram.py | 51 ++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 59 deletions(-) diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py index 82d99b758..bcb14b6bf 100644 --- a/malariagen_data/anoph/dipclust.py +++ b/malariagen_data/anoph/dipclust.py @@ -13,7 +13,7 @@ multiallelic_diplotype_mean_sqeuclidean, multiallelic_diplotype_mean_cityblock, ) -from ..plotly_dendrogram import plot_dendrogram +from ..plotly_dendrogram import plot_dendrogram, concat_clustering_subplots from . import ( base_params, plotly_params, @@ -499,56 +499,6 @@ def _dipclust_snp_trace( return snp_trace, n_snps - def _dipclust_concat_subplots( - self, - figures, - width, - height, - row_heights, - region: base_params.regions, - n_snps: int, - sample_sets: Optional[base_params.sample_sets], - sample_query: Optional[base_params.sample_query], - ): - from plotly.subplots import make_subplots # type: ignore - - title_lines = [] - if sample_sets is not None: - title_lines.append(f"sample sets: {sample_sets}") - if sample_query is not None: - title_lines.append(f"sample query: {sample_query}") - title_lines.append(f"genomic region: {region} ({n_snps} SNPs)") - title = "
".join(title_lines) - - # make subplots - fig = make_subplots( - rows=len(figures), - cols=1, - shared_xaxes=True, - vertical_spacing=0.02, - row_heights=row_heights, - ) - - for i, figure in enumerate(figures): - if isinstance(figure, go.Figure): - # This is a figure, access the traces within it. - for trace in range(len(figure["data"])): - fig.append_trace(figure["data"][trace], row=i + 1, col=1) - else: - # Assume this is a trace, add directly. - fig.append_trace(figure, row=i + 1, col=1) - - fig.update_xaxes(visible=False) - fig.update_layout( - title=title, - width=width, - height=height, - hovermode="closest", - plot_bgcolor="white", - ) - - return fig - def _insert_dipclust_snp_trace( self, *, @@ -768,7 +718,7 @@ def plot_diplotype_clustering_advanced( # Calculate total height based on subplot heights, plus a fixed # additional component to allow for title, axes etc. height = sum(subplot_heights) + 50 - fig = self._dipclust_concat_subplots( + fig = concat_clustering_subplots( figures=figures, width=width, height=height, diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 7f0c8c9ce..c978eb18e 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -6,7 +6,7 @@ from numpydoc_decorator import doc # type: ignore from ..util import CacheMiss, check_types, pdist_abs_hamming, pandas_apply -from ..plotly_dendrogram import plot_dendrogram +from ..plotly_dendrogram import plot_dendrogram, concat_clustering_subplots from . 
import ( base_params, plotly_params, @@ -19,12 +19,9 @@ from .snp_data import AnophelesSnpData from .snp_frq import AA_CHANGE_QUERY, _make_snp_label_effect from .hap_data import AnophelesHapData -from .dipclust import AnophelesDipClustAnalysis -class AnophelesHapClustAnalysis( - AnophelesHapData, AnophelesSnpData, AnophelesDipClustAnalysis -): +class AnophelesHapClustAnalysis(AnophelesHapData, AnophelesSnpData): def __init__( self, **kwargs, @@ -343,6 +340,10 @@ def _haplotype_pairwise_distances( parameters=dict( snp_transcript="Plot amino acid variants for these transcripts.", snp_filter_min_maf="Filter amino acid variants with alternate allele frequency below this threshold.", + snp_query="Query to filter SNPs for amino acid heatmap. Default is to include all non-synonymous SNPs.", + cluster_threshold="Height at which to cut the dendrogram to form clusters. If not provided, no clusters assignment is not performed.", + min_cluster_size="Minimum number of haplotypes required in a cluster to be included when cutting the dendrogram. Default is 5.", + cluster_criterion="The cluster_criterion to use in forming flat clusters. One of 'inconsistent', 'distance', 'maxclust', 'maxclust_monochronic', 'monocrit'. See scipy.cluster.hierarchy.fcluster for details.", ), ) def plot_haplotype_clustering_advanced( @@ -360,7 +361,7 @@ def plot_haplotype_clustering_advanced( cohort_size: Optional[base_params.cohort_size] = None, distance_metric: hapclust_params.distance_metric = hapclust_params.distance_metric_default, cluster_threshold: Optional[float] = None, - min_cluster_size: Optional[int] = None, + min_cluster_size: Optional[int] = 5, cluster_criterion="distance", color: plotly_params.color = None, symbol: plotly_params.symbol = None, @@ -487,7 +488,7 @@ def plot_haplotype_clustering_advanced( # Calculate total height based on subplot heights, plus a fixed # additional component to allow for title, axes etc. 
height = sum(subplot_heights) + 50 - fig = self._dipclust_concat_subplots( + fig = concat_clustering_subplots( figures=figures, width=width, height=height, diff --git a/malariagen_data/plotly_dendrogram.py b/malariagen_data/plotly_dendrogram.py index a172c469b..8a94907d1 100644 --- a/malariagen_data/plotly_dendrogram.py +++ b/malariagen_data/plotly_dendrogram.py @@ -130,3 +130,54 @@ def plot_dendrogram( ) return fig, leaf_data + + +def concat_clustering_subplots( + figures, + width, + height, + row_heights, + region, + n_snps, + sample_sets, + sample_query, +): + from plotly.subplots import make_subplots # type: ignore + import plotly.graph_objects as go # type: ignore + + title_lines = [] + if sample_sets is not None: + title_lines.append(f"sample sets: {sample_sets}") + if sample_query is not None: + title_lines.append(f"sample query: {sample_query}") + title_lines.append(f"genomic region: {region} ({n_snps} SNPs)") + title = "
".join(title_lines) + + # make subplots + fig = make_subplots( + rows=len(figures), + cols=1, + shared_xaxes=True, + vertical_spacing=0.02, + row_heights=row_heights, + ) + + for i, figure in enumerate(figures): + if isinstance(figure, go.Figure): + # This is a figure, access the traces within it. + for trace in range(len(figure["data"])): + fig.append_trace(figure["data"][trace], row=i + 1, col=1) + else: + # Assume this is a trace, add directly. + fig.append_trace(figure, row=i + 1, col=1) + + fig.update_xaxes(visible=False) + fig.update_layout( + title=title, + width=width, + height=height, + hovermode="closest", + plot_bgcolor="white", + ) + + return fig From 5a273d0c97459c2a638b51596d44ce33ad791b5e Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 14:31:35 +0100 Subject: [PATCH 12/15] add support for multiple transcripts, lines --- malariagen_data/anoph/dipclust.py | 55 ++++++++++++- malariagen_data/anoph/hapclust.py | 126 ++++++++++++++++++++---------- 2 files changed, 136 insertions(+), 45 deletions(-) diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py index bcb14b6bf..825c92861 100644 --- a/malariagen_data/anoph/dipclust.py +++ b/malariagen_data/anoph/dipclust.py @@ -538,7 +538,7 @@ def _insert_dipclust_snp_trace( print( f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot." 
) - return figures, subplot_heights + return figures, subplot_heights, n_snps_transcript @doc( summary="""" @@ -679,8 +679,13 @@ def plot_diplotype_clustering_advanced( figures.append(cnv_trace) subplot_heights.append(cnv_row_height * n_cnv_genes) + n_snps_transcripts = [] if isinstance(snp_transcript, str): - figures, subplot_heights = self._insert_dipclust_snp_trace( + ( + figures, + subplot_heights, + n_snps_transcript, + ) = self._insert_dipclust_snp_trace( transcript=snp_transcript, figures=figures, subplot_heights=subplot_heights, @@ -696,9 +701,14 @@ def plot_diplotype_clustering_advanced( chunks=chunks, inline_array=inline_array, ) + n_snps_transcripts.append(n_snps_transcript) elif isinstance(snp_transcript, list): for st in snp_transcript: - figures, subplot_heights = self._insert_dipclust_snp_trace( + ( + figures, + subplot_heights, + n_snps_transcript, + ) = self._insert_dipclust_snp_trace( transcript=st, figures=figures, subplot_heights=subplot_heights, @@ -714,6 +724,7 @@ def plot_diplotype_clustering_advanced( chunks=chunks, inline_array=inline_array, ) + n_snps_transcripts.append(n_snps_transcript) # Calculate total height based on subplot heights, plus a fixed # additional component to allow for title, axes etc. 
@@ -739,6 +750,44 @@ def plot_diplotype_clustering_advanced( legend=dict(itemsizing=legend_sizing, tracegroupgap=0), ) + # add lines to aa plot - looks neater + if snp_transcript: + n_transcripts = ( + len(snp_transcript) if isinstance(snp_transcript, list) else 1 + ) + for i in range(n_transcripts): + tx_idx = len(figures) - n_transcripts + i + 1 + if n_snps_transcripts[i] > 0: + fig.add_hline( + y=-0.5, line_width=1, line_color="grey", row=tx_idx, col=1 + ) + for j in range(n_snps_transcripts[i]): + fig.add_hline( + y=j + 0.5, + line_width=1, + line_color="grey", + row=tx_idx, + col=1, + ) + + fig.update_xaxes( + showline=True, + linecolor="grey", + linewidth=1, + row=tx_idx, + col=1, + mirror=True, + ) + + fig.update_yaxes( + showline=True, + linecolor="grey", + linewidth=1, + row=tx_idx, + col=1, + mirror=True, + ) + if show: fig.show(renderer=renderer) return None diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index c978eb18e..cf9f0ef1b 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -468,22 +468,50 @@ def plot_haplotype_clustering_advanced( figures.append(snp_trace) subplot_heights.append(25) - figures, subplot_heights, df_haps = self._insert_hapclust_snp_trace( - transcript=snp_transcript, - snp_query=snp_query, - figures=figures, - subplot_heights=subplot_heights, - sample_sets=sample_sets, - sample_query=sample_query, - analysis=analysis, - dendro_sample_id_order=dendro_sample_id_order, - snp_filter_min_maf=snp_filter_min_maf, - snp_colorscale=snp_colorscale, - snp_row_height=snp_row_height, - chunks=chunks, - inline_array=inline_array, - ) - n_aa = df_haps.shape[0] + n_snps_transcripts = [] + if isinstance(snp_transcript, str): + ( + figures, + subplot_heights, + n_snps_transcript, + ) = self._insert_hapclust_snp_trace( + transcript=snp_transcript, + snp_query=snp_query, + figures=figures, + subplot_heights=subplot_heights, + sample_sets=sample_sets, + sample_query=sample_query, 
+ analysis=analysis, + dendro_sample_id_order=dendro_sample_id_order, + snp_filter_min_maf=snp_filter_min_maf, + snp_colorscale=snp_colorscale, + snp_row_height=snp_row_height, + chunks=chunks, + inline_array=inline_array, + ) + n_snps_transcripts.append(n_snps_transcript) + elif isinstance(snp_transcript, list): + for st in snp_transcript: + ( + figures, + subplot_heights, + n_snps_transcript, + ) = self._insert_hapclust_snp_trace( + transcript=st, + snp_query=snp_query, + figures=figures, + subplot_heights=subplot_heights, + sample_sets=sample_sets, + sample_query=sample_query, + analysis=analysis, + dendro_sample_id_order=dendro_sample_id_order, + snp_filter_min_maf=snp_filter_min_maf, + snp_colorscale=snp_colorscale, + snp_row_height=snp_row_height, + chunks=chunks, + inline_array=inline_array, + ) + n_snps_transcripts.append(n_snps_transcript) # Calculate total height based on subplot heights, plus a fixed # additional component to allow for title, axes etc. @@ -499,7 +527,7 @@ def plot_haplotype_clustering_advanced( n_snps=n_snps_cluster, ) - fig["layout"]["yaxis"]["title"] = "Distance" + fig["layout"]["yaxis"]["title"] = f"Distance ({distance_metric})" fig.update_layout( title_font=dict( size=title_font_size, @@ -507,37 +535,49 @@ def plot_haplotype_clustering_advanced( legend=dict(itemsizing=legend_sizing, tracegroupgap=0), ) - if snp_transcript and n_aa > 0: - # add lines to aa plot - aa_idx = len(figures) - fig.add_hline(y=-0.5, line_width=1, line_color="grey", row=aa_idx, col=1) - for i in range(n_aa): - fig.add_hline( - y=i + 0.5, line_width=1, line_color="grey", row=aa_idx, col=1 + # add lines to aa plot - looks neater + if snp_transcript: + n_transcripts = ( + len(snp_transcript) if isinstance(snp_transcript, list) else 1 + ) + for i in range(n_transcripts): + tx_idx = len(figures) - n_transcripts + i + 1 + if n_snps_transcripts[i] > 0: + fig.add_hline( + y=-0.5, line_width=1, line_color="grey", row=tx_idx, col=1 + ) + for j in 
range(n_snps_transcripts[i]): + fig.add_hline( + y=j + 0.5, + line_width=1, + line_color="grey", + row=tx_idx, + col=1, + ) + + fig.update_xaxes( + showline=True, + linecolor="grey", + linewidth=1, + row=tx_idx, + col=1, + mirror=True, ) - fig.update_xaxes( - showline=True, - linecolor="grey", - linewidth=1, - row=aa_idx, - col=1, - mirror=True, - ) - fig.update_yaxes( - showline=True, - linecolor="grey", - linewidth=1, - row=aa_idx, - col=1, - mirror=True, - ) + fig.update_yaxes( + showline=True, + linecolor="grey", + linewidth=1, + row=tx_idx, + col=1, + mirror=True, + ) if show: fig.show(renderer=renderer) return None else: - return fig, leaf_data, df_haps + return fig, leaf_data def transcript_haplotypes( self, @@ -642,6 +682,8 @@ def _insert_hapclust_snp_trace( df_haps = df_haps.assign(af=lambda x: x.sum(axis=1) / x.shape[1]) df_haps = df_haps.query("af > @snp_filter_min_maf").drop(columns="af") + n_snps_transcript = df_haps.shape[0] + if not df_haps.empty: snp_trace = go.Heatmap( z=df_haps.values, @@ -660,7 +702,7 @@ def _insert_hapclust_snp_trace( print( f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot." 
) - return figures, subplot_heights, df_haps + return figures, subplot_heights, n_snps_transcript def cut_dist_tree( self, From 95923d667d0ee017d68174f40c99bf719b922c24 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 15:06:32 +0100 Subject: [PATCH 13/15] rename cluster_id --- malariagen_data/anoph/hapclust.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index cf9f0ef1b..ccc4f389f 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -756,7 +756,7 @@ def cut_dist_tree( pd.DataFrame( { "sample_id": dist_samples, - "cluster_id": _filter_and_remap( + "Cluster ID": _filter_and_remap( cluster_assignments, x=min_cluster_size ), } From 7b969710573048de1b6b2582c51b10065c6796b2 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Fri, 19 Sep 2025 15:48:19 +0100 Subject: [PATCH 14/15] add anophFreqAnalysis class to hapclust --- malariagen_data/anoph/hapclust.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index ccc4f389f..571fd06b3 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -17,11 +17,11 @@ dipclust_params, ) from .snp_data import AnophelesSnpData -from .snp_frq import AA_CHANGE_QUERY, _make_snp_label_effect +from .snp_frq import AnophelesFrequencyAnalysis, AA_CHANGE_QUERY, _make_snp_label_effect from .hap_data import AnophelesHapData -class AnophelesHapClustAnalysis(AnophelesHapData, AnophelesSnpData): +class AnophelesHapClustAnalysis(AnophelesHapData, AnophelesSnpData, AnophelesFrequencyAnalysis): def __init__( self, **kwargs, From 094bc065bf9c1623b8ce22b5d326ba5fd43af068 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Fri, 19 Sep 2025 15:51:05 +0100 Subject: [PATCH 15/15] pre-commit lints --- malariagen_data/anoph/hapclust.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff 
--git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 571fd06b3..792b33509 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -21,7 +21,9 @@ from .hap_data import AnophelesHapData -class AnophelesHapClustAnalysis(AnophelesHapData, AnophelesSnpData, AnophelesFrequencyAnalysis): +class AnophelesHapClustAnalysis( + AnophelesHapData, AnophelesSnpData, AnophelesFrequencyAnalysis +): def __init__( self, **kwargs,