From 837b1de04157ffe023a75c7efb1b9abbfe7839ed Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Fri, 5 Sep 2025 22:05:03 +0100 Subject: [PATCH 01/15] add advanced haplotype clustering --- malariagen_data/anoph/dipclust.py | 6 +- malariagen_data/anoph/hapclust.py | 374 +++++++++++++++++++++++++++++- 2 files changed, 372 insertions(+), 8 deletions(-) diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py index b96ee4283..043019ec4 100644 --- a/malariagen_data/anoph/dipclust.py +++ b/malariagen_data/anoph/dipclust.py @@ -22,13 +22,9 @@ dipclust_params, cnv_params, ) -from .snp_frq import AnophelesSnpFrequencyAnalysis +from .snp_frq import AnophelesSnpFrequencyAnalysis, AA_CHANGE_QUERY from .cnv_frq import AnophelesCnvFrequencyAnalysis -AA_CHANGE_QUERY = ( - "effect in ['NON_SYNONYMOUS_CODING', 'START_LOST', 'STOP_LOST', 'STOP_GAINED']" -) - class AnophelesDipClustAnalysis( AnophelesCnvFrequencyAnalysis, AnophelesSnpFrequencyAnalysis diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 05eb6acb8..bb9d80369 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -16,6 +16,7 @@ hapclust_params, ) from .snp_data import AnophelesSnpData +from .snp_frq import AA_CHANGE_QUERY from .hap_data import AnophelesHapData @@ -127,6 +128,7 @@ def plot_haplotype_clustering( # Repeat the dataframe so there is one row of metadata for each haplotype. df_haps = pd.DataFrame(np.repeat(df_samples_phased.values, 2, axis=0)) df_haps.columns = df_samples_phased.columns + leaf_data = df_haps.assign(sample_id=_make_unique(df_haps.sample_id)) # Configure hover data. hover_data = self._setup_sample_hover_data_plotly( @@ -145,7 +147,7 @@ def plot_haplotype_clustering( # Create the plot. 
with self._spinner("Plot dendrogram"): - fig, _ = plot_dendrogram( + fig, leaf_data = plot_dendrogram( dist=dist, linkage_method=linkage_method, count_sort=count_sort, @@ -157,7 +159,7 @@ def plot_haplotype_clustering( line_width=line_width, line_color=line_color, marker_size=marker_size, - leaf_data=df_haps, + leaf_data=leaf_data, leaf_hover_name="sample_id", leaf_hover_data=hover_data, leaf_color=color_prepped, @@ -182,7 +184,13 @@ def plot_haplotype_clustering( fig.show(renderer=renderer) return None else: - return fig + return { + "figure": fig, + "n_snps": n_snps_used, + "dist": dist, + "dist_samples": phased_samples, + "leaf_data":leaf_data, + } @doc( summary=""" @@ -292,3 +300,363 @@ def _haplotype_pairwise_distances( phased_samples=phased_samples, n_snps=np.array(ht.shape[0]), ) + + def plot_haplotype_clustering_advanced( + self, + region, + analysis, + snp_transcript=None, + snp_colorscale="Greys", + snp_filter_min_maf=0.05, + snp_query=AA_CHANGE_QUERY, + sample_sets=None, + sample_query=None, + sample_query_options=None, + random_seed=42, + cohort_size=None, + cut_height=None, + min_cluster_size=None, + color=None, + symbol=None, + linkage_method="complete", + count_sort=None, + distance_sort=None, + title=True, + title_font_size=14, + width=None, + dendrogram_height=300, + snp_row_height=25, + show=True, + renderer=None, + render_mode="svg", + leaf_y=0, + marker_size=5, + line_width=0.5, + line_color="black", + color_discrete_sequence=None, + color_discrete_map=None, + category_orders=None, + legend_sizing="constant", + chunks="native", + inline_array=True, + ): + import plotly.express as px + import plotly.graph_objects as go + + if cohort_size and snp_transcript: + cohort_size = None + print( + "Cohort size is not supported with amino acid heatmap. Overriding cohort size to None." 
+ ) + + res = self.plot_haplotype_clustering( + region=region, + analysis=analysis, + sample_sets=sample_sets, + sample_query=sample_query, + sample_query_options=sample_query_options, + count_sort=count_sort, + cohort_size=cohort_size, + distance_sort=distance_sort, + linkage_method=linkage_method, + color=color, + symbol=symbol, + title=title, + title_font_size=title_font_size, + width=width, + height=dendrogram_height, + show=False, + renderer=renderer, + render_mode=render_mode, + leaf_y=leaf_y, + marker_size=marker_size, + line_width=line_width, + line_color=line_color, + color_discrete_sequence=color_discrete_sequence, + color_discrete_map=color_discrete_map, + category_orders=category_orders, + legend_sizing=legend_sizing, + random_seed=random_seed, + chunks=chunks, + inline_array=inline_array, + ) + + fig_dendro = res["figure"] + n_snps_cluster = res["n_snps"] + leaf_data = res["leaf_data"] + dendro_sample_id_order = np.asarray(leaf_data["sample_id"].to_list()) + + figures = [fig_dendro] + subplot_heights = [dendrogram_height] + + if cut_height and min_cluster_size: + cluster_col_list = px.colors.sequential.Turbo.copy() + cluster_col_list.insert(0, 'white') + + df_clusters = self.cut_dist_tree( + dist=res['dist'], + dist_samples=_make_unique(np.repeat(res['dist_samples'], 2)), + dendro_sample_id_order=dendro_sample_id_order, + linkage_method=linkage_method, + cut_height=cut_height, + min_cluster_size=min_cluster_size, + count_sort=count_sort, + distance_sort=distance_sort + ) + + leaf_data = leaf_data.merge(df_clusters.T.reset_index()) + + snp_trace = go.Heatmap( + z=df_clusters.values, + y=df_clusters.index.to_list(), + colorscale=cluster_col_list, + showlegend=False, + showscale=False, + ) + figures.append(snp_trace) + subplot_heights.append(25) + + + figures, subplot_heights, df_haps = self._insert_hapclust_snp_trace( + transcript=snp_transcript, + snp_query=snp_query, + figures=figures, + subplot_heights=subplot_heights, + sample_sets=sample_sets, + 
sample_query=sample_query, + analysis=analysis, + dendro_sample_id_order=dendro_sample_id_order, + snp_filter_min_maf=snp_filter_min_maf, + snp_colorscale=snp_colorscale, + snp_row_height=snp_row_height, + chunks=chunks, + inline_array=inline_array, + ) + n_aa = df_haps.shape[0] + + # Calculate total height based on subplot heights, plus a fixed + # additional component to allow for title, axes etc. + height = sum(subplot_heights) + 50 + fig = self._dipclust_concat_subplots( + figures=figures, + width=width, + height=height, + row_heights=subplot_heights, + sample_sets=sample_sets, + sample_query=sample_query, # Only uses query for title. + region=region, + n_snps=n_snps_cluster, + ) + + fig["layout"]["yaxis"]["title"] = f"Distance" + fig.update_layout( + title_font=dict( + size=title_font_size, + ), + legend=dict(itemsizing=legend_sizing, tracegroupgap=0), + ) + + if snp_transcript: + # add lines to aa plot + aa_idx = len(figures) + fig.add_hline(y=-0.5, line_width=1, line_color="grey", row=aa_idx, col=1) + for i in range(n_aa): + fig.add_hline(y=i+0.5, line_width=1, line_color="grey", row=aa_idx, col=1) + + fig.update_xaxes(showline = True, linecolor = 'grey', linewidth = 1, row = aa_idx, col = 1, mirror = True) + fig.update_yaxes(showline = True, linecolor = 'grey', linewidth = 1, row = aa_idx, col = 1, mirror = True) + + if show: + fig.show(renderer=renderer) + return None + else: + return fig, leaf_data, df_haps + + + def transcript_haplotypes( + self, + transcript, + sample_sets, + sample_query, + analysis, + snp_query, + chunks, + inline_array + ): + """ + Extract haplotype calls for a given transcript. 
+ """ + + # Get SNP genotype allele counts for the transcript, applying snp_query + df_snps = self.snp_genotype_allele_counts( + transcript=transcript, + sample_query=sample_query, + sample_sets=sample_sets, + site_mask=None, + snp_query=snp_query, + chunks=chunks, + inline_array=inline_array + ) + + # Add a unique variant identifier: "position-alt_allele" + df_snps = df_snps.assign( + pos_alt=lambda x: x.position.astype(str) + "-" + x.alt_allele + ) + + # Get haplotypes for the transcript + ds_haps = self.haplotypes( + region=transcript, + sample_sets=sample_sets, + sample_query=sample_query, + analysis=analysis, + chunks=chunks, + inline_array=inline_array + ) + + # Convert genotype calls to haplotypes + haps = allel.GenotypeArray(ds_haps["call_genotype"].values).to_haplotypes() + h_pos = allel.SortedIndex(ds_haps["variant_position"].values) + h_alts = ds_haps["variant_allele"].values.astype(str)[:, 1] + h_pos_alts = np.array([f"{pos}-{h_alts[i]}" for i, pos in enumerate(h_pos)]) + sample_ids = ds_haps["sample_id"].values + + # Filter haplotypes to SNPs present in df_snps + df_snps = df_snps.query("pos_alt in @h_pos_alts") + label = df_snps.label.values + haps_bool = np.isin(h_pos_alts, df_snps.pos_alt) + haps = haps.compress(haps_bool) + + # Build haplotype DataFrame + df_haps = pd.DataFrame( + haps, + index=label, + columns=_make_unique(np.repeat(sample_ids, 2)) # two haplotypes per sample + ) + + return df_haps + + def _insert_hapclust_snp_trace( + self, + figures, + subplot_heights, + transcript, + analysis, + dendro_sample_id_order: np.ndarray, + snp_filter_min_maf: float, + snp_colorscale, + snp_query, + snp_row_height, + sample_sets, + sample_query, + chunks, + inline_array, + ): + from plotly import graph_objects as go + + # load genotype allele counts at SNP variants for each sample + df_haps = self.transcript_haplotypes( + transcript=transcript, + snp_query=snp_query, + sample_query=sample_query, + sample_sets=sample_sets, + analysis=analysis, + 
chunks=chunks, + inline_array=inline_array + ) + + # set to diplotype cluster order + df_haps = df_haps.loc[ + :, dendro_sample_id_order + ] + + if snp_filter_min_maf: + df_haps = df_haps.assign(af=lambda x: x.sum(axis=1) / x.shape[1]) + df_haps = df_haps.query("af > @snp_filter_min_maf").drop(columns="af") + + if not df_haps.empty: + snp_trace = go.Heatmap( + z=df_haps.values, + y=df_haps.index.to_list(), + colorscale=snp_colorscale, + showlegend=False, + showscale=False, + ) + else: + snp_trace = None + + if snp_trace: + figures.append(snp_trace) + subplot_heights.append(snp_row_height * df_haps.shape[0]) + else: + print( + f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot." + ) + return figures, subplot_heights, df_haps + + + def cut_dist_tree(self, dist, dist_samples, dendro_sample_id_order, linkage_method, cut_height, min_cluster_size, count_sort, distance_sort): + """ + Create a one-row DataFrame with haplotype_ids as columns and cluster assignments as values + + Parameters: + ----------- + dist : ndarray + distance array + dist_samples : array-like + List/array of individual identifiers (haplotype_ids) + linkage_method : str + Method used to calculate the linkage matrix + cut_height : float + Height at which to cut the dendrogram + min_cluster_size : int, default=1 + Minimum number of individuals required in a cluster to be included + count_sort : bool + Whether to sort clusters by count + distance_sort : bool + Whether to sort clusters by distance + + Returns: + -------- + pd.DataFrame + One-row DataFrame with haplotype_ids as columns and assigned cluster numbers (1...n) as values + """ + from scipy.cluster.hierarchy import linkage, fcluster + + Z = linkage(dist, method=linkage_method) + + # Get cluster assignments for each individual + cluster_assignments = fcluster( + Z, cut_height, criterion='distance' + ) + + # Create initial DataFrame + df = pd.DataFrame({ + 'sample_id': dist_samples, + 'cluster_id': 
_filter_and_remap(cluster_assignments, x=min_cluster_size) + }).set_index('sample_id').T.loc[:, dendro_sample_id_order] + + return df + +def _filter_and_remap(arr, x): + from collections import Counter + # Get unique values that appear >= x times + valid_values = [val for val, count in Counter(arr).items() if count >= x] + # Create mapping to 1, 2, 3, ..., n + mapping = {val: i + 1 for i, val in enumerate(sorted(valid_values))} + # Apply transformation + return np.array([mapping.get(val, 0) for val in arr]) + + +def _make_unique(values): + value_counts = {} + unique_values = [] + + for value in values: + if value in value_counts: + value_counts[value] += 1 + unique_values.append(f"{value}_{value_counts[value]}") + else: + value_counts[value] = 0 + unique_values.append(f"{value}_{value_counts[value]}") + + return np.array(unique_values) \ No newline at end of file From df2bb6c6bce0d6426a764acb60cb25ec650f65ec Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Fri, 5 Sep 2025 22:32:05 +0100 Subject: [PATCH 02/15] use snp_effects() --- malariagen_data/anoph/hapclust.py | 139 +++++++++++++++++------------- 1 file changed, 81 insertions(+), 58 deletions(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index bb9d80369..cf1134e1c 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -189,7 +189,7 @@ def plot_haplotype_clustering( "n_snps": n_snps_used, "dist": dist, "dist_samples": phased_samples, - "leaf_data":leaf_data, + "leaf_data": leaf_data, } @doc( @@ -388,24 +388,24 @@ def plot_haplotype_clustering_advanced( figures = [fig_dendro] subplot_heights = [dendrogram_height] - + if cut_height and min_cluster_size: cluster_col_list = px.colors.sequential.Turbo.copy() - cluster_col_list.insert(0, 'white') - + cluster_col_list.insert(0, "white") + df_clusters = self.cut_dist_tree( - dist=res['dist'], - dist_samples=_make_unique(np.repeat(res['dist_samples'], 2)), + dist=res["dist"], + 
dist_samples=_make_unique(np.repeat(res["dist_samples"], 2)), dendro_sample_id_order=dendro_sample_id_order, - linkage_method=linkage_method, + linkage_method=linkage_method, cut_height=cut_height, - min_cluster_size=min_cluster_size, - count_sort=count_sort, - distance_sort=distance_sort + min_cluster_size=min_cluster_size, + count_sort=count_sort, + distance_sort=distance_sort, ) leaf_data = leaf_data.merge(df_clusters.T.reset_index()) - + snp_trace = go.Heatmap( z=df_clusters.values, y=df_clusters.index.to_list(), @@ -415,8 +415,7 @@ def plot_haplotype_clustering_advanced( ) figures.append(snp_trace) subplot_heights.append(25) - - + figures, subplot_heights, df_haps = self._insert_hapclust_snp_trace( transcript=snp_transcript, snp_query=snp_query, @@ -448,31 +447,46 @@ def plot_haplotype_clustering_advanced( n_snps=n_snps_cluster, ) - fig["layout"]["yaxis"]["title"] = f"Distance" + fig["layout"]["yaxis"]["title"] = "Distance" fig.update_layout( title_font=dict( size=title_font_size, ), legend=dict(itemsizing=legend_sizing, tracegroupgap=0), ) - + if snp_transcript: - # add lines to aa plot + # add lines to aa plot aa_idx = len(figures) fig.add_hline(y=-0.5, line_width=1, line_color="grey", row=aa_idx, col=1) for i in range(n_aa): - fig.add_hline(y=i+0.5, line_width=1, line_color="grey", row=aa_idx, col=1) - - fig.update_xaxes(showline = True, linecolor = 'grey', linewidth = 1, row = aa_idx, col = 1, mirror = True) - fig.update_yaxes(showline = True, linecolor = 'grey', linewidth = 1, row = aa_idx, col = 1, mirror = True) + fig.add_hline( + y=i + 0.5, line_width=1, line_color="grey", row=aa_idx, col=1 + ) + + fig.update_xaxes( + showline=True, + linecolor="grey", + linewidth=1, + row=aa_idx, + col=1, + mirror=True, + ) + fig.update_yaxes( + showline=True, + linecolor="grey", + linewidth=1, + row=aa_idx, + col=1, + mirror=True, + ) if show: fig.show(renderer=renderer) return None else: return fig, leaf_data, df_haps - - + def transcript_haplotypes( self, 
transcript, @@ -481,25 +495,19 @@ def transcript_haplotypes( analysis, snp_query, chunks, - inline_array + inline_array, ): """ Extract haplotype calls for a given transcript. """ - + # Get SNP genotype allele counts for the transcript, applying snp_query - df_snps = self.snp_genotype_allele_counts( + df_eff = self.snp_effects( transcript=transcript, - sample_query=sample_query, - sample_sets=sample_sets, - site_mask=None, - snp_query=snp_query, - chunks=chunks, - inline_array=inline_array ) # Add a unique variant identifier: "position-alt_allele" - df_snps = df_snps.assign( + df_eff = df_eff.assign( pos_alt=lambda x: x.position.astype(str) + "-" + x.alt_allele ) @@ -510,7 +518,7 @@ def transcript_haplotypes( sample_query=sample_query, analysis=analysis, chunks=chunks, - inline_array=inline_array + inline_array=inline_array, ) # Convert genotype calls to haplotypes @@ -520,17 +528,17 @@ def transcript_haplotypes( h_pos_alts = np.array([f"{pos}-{h_alts[i]}" for i, pos in enumerate(h_pos)]) sample_ids = ds_haps["sample_id"].values - # Filter haplotypes to SNPs present in df_snps - df_snps = df_snps.query("pos_alt in @h_pos_alts") - label = df_snps.label.values - haps_bool = np.isin(h_pos_alts, df_snps.pos_alt) + # Filter df_eff to haplotypes, and filter haplotypes to SNPs present in df_eff + df_eff = df_eff.query("pos_alt in @h_pos_alts") + label = df_eff.label.values + haps_bool = np.isin(h_pos_alts, df_eff.pos_alt) haps = haps.compress(haps_bool) # Build haplotype DataFrame df_haps = pd.DataFrame( haps, index=label, - columns=_make_unique(np.repeat(sample_ids, 2)) # two haplotypes per sample + columns=_make_unique(np.repeat(sample_ids, 2)), # two haplotypes per sample ) return df_haps @@ -552,7 +560,7 @@ def _insert_hapclust_snp_trace( inline_array, ): from plotly import graph_objects as go - + # load genotype allele counts at SNP variants for each sample df_haps = self.transcript_haplotypes( transcript=transcript, @@ -561,13 +569,11 @@ def 
_insert_hapclust_snp_trace( sample_sets=sample_sets, analysis=analysis, chunks=chunks, - inline_array=inline_array + inline_array=inline_array, ) - + # set to diplotype cluster order - df_haps = df_haps.loc[ - :, dendro_sample_id_order - ] + df_haps = df_haps.loc[:, dendro_sample_id_order] if snp_filter_min_maf: df_haps = df_haps.assign(af=lambda x: x.sum(axis=1) / x.shape[1]) @@ -583,7 +589,7 @@ def _insert_hapclust_snp_trace( ) else: snp_trace = None - + if snp_trace: figures.append(snp_trace) subplot_heights.append(snp_row_height * df_haps.shape[0]) @@ -593,8 +599,17 @@ def _insert_hapclust_snp_trace( ) return figures, subplot_heights, df_haps - - def cut_dist_tree(self, dist, dist_samples, dendro_sample_id_order, linkage_method, cut_height, min_cluster_size, count_sort, distance_sort): + def cut_dist_tree( + self, + dist, + dist_samples, + dendro_sample_id_order, + linkage_method, + cut_height, + min_cluster_size, + count_sort, + distance_sort, + ): """ Create a one-row DataFrame with haplotype_ids as columns and cluster assignments as values @@ -625,20 +640,28 @@ def cut_dist_tree(self, dist, dist_samples, dendro_sample_id_order, linkage_meth Z = linkage(dist, method=linkage_method) # Get cluster assignments for each individual - cluster_assignments = fcluster( - Z, cut_height, criterion='distance' - ) - + cluster_assignments = fcluster(Z, cut_height, criterion="distance") + # Create initial DataFrame - df = pd.DataFrame({ - 'sample_id': dist_samples, - 'cluster_id': _filter_and_remap(cluster_assignments, x=min_cluster_size) - }).set_index('sample_id').T.loc[:, dendro_sample_id_order] + df = ( + pd.DataFrame( + { + "sample_id": dist_samples, + "cluster_id": _filter_and_remap( + cluster_assignments, x=min_cluster_size + ), + } + ) + .set_index("sample_id") + .T.loc[:, dendro_sample_id_order] + ) return df + def _filter_and_remap(arr, x): from collections import Counter + # Get unique values that appear >= x times valid_values = [val for val, count in 
Counter(arr).items() if count >= x] # Create mapping to 1, 2, 3, ..., n @@ -650,7 +673,7 @@ def _filter_and_remap(arr, x): def _make_unique(values): value_counts = {} unique_values = [] - + for value in values: if value in value_counts: value_counts[value] += 1 @@ -658,5 +681,5 @@ def _make_unique(values): else: value_counts[value] = 0 unique_values.append(f"{value}_{value_counts[value]}") - - return np.array(unique_values) \ No newline at end of file + + return np.array(unique_values) From eadb8b12c5e1e3f0e255000332edeadc6b68dbd3 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Fri, 5 Sep 2025 22:40:46 +0100 Subject: [PATCH 03/15] add labels --- malariagen_data/anoph/hapclust.py | 10 +- notebooks/plot_haplotype_clustering.ipynb | 194 ++++++++++++++++++++-- 2 files changed, 192 insertions(+), 12 deletions(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index cf1134e1c..10194b29a 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -5,7 +5,7 @@ import pandas as pd from numpydoc_decorator import doc # type: ignore -from ..util import CacheMiss, check_types, pdist_abs_hamming +from ..util import CacheMiss, check_types, pdist_abs_hamming, pandas_apply from ..plotly_dendrogram import plot_dendrogram from . 
import ( base_params, @@ -16,7 +16,7 @@ hapclust_params, ) from .snp_data import AnophelesSnpData -from .snp_frq import AA_CHANGE_QUERY +from .snp_frq import AA_CHANGE_QUERY, _make_snp_label_effect from .hap_data import AnophelesHapData @@ -504,6 +504,12 @@ def transcript_haplotypes( # Get SNP genotype allele counts for the transcript, applying snp_query df_eff = self.snp_effects( transcript=transcript, + ).query(snp_query) + + df_eff["label"] = pandas_apply( + _make_snp_label_effect, + df_eff, + columns=["contig", "position", "ref_allele", "alt_allele", "aa_change"], ) # Add a unique variant identifier: "position-alt_allele" diff --git a/notebooks/plot_haplotype_clustering.ipynb b/notebooks/plot_haplotype_clustering.ipynb index c6dbdf4f3..fd344a91e 100644 --- a/notebooks/plot_haplotype_clustering.ipynb +++ b/notebooks/plot_haplotype_clustering.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "d3f73666", "metadata": {}, "outputs": [], @@ -21,10 +21,109 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "38abbb28", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/javascript": "'use strict';\n(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\nconst JS_MIME_TYPE = 'application/javascript';\n const HTML_MIME_TYPE = 'text/html';\n const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n const CLASS_NAME = 'output_bokeh rendered_html';\n\n /**\n * Render data to the DOM node\n */\n function render(props, node) {\n const script = document.createElement(\"script\");\n node.appendChild(script);\n }\n\n /**\n * Handle when an output is cleared or removed\n */\n function handleClearOutput(event, handle) {\n function drop(id) {\n const view = 
Bokeh.index.get_by_id(id)\n if (view != null) {\n view.model.document.clear()\n Bokeh.index.delete(view)\n }\n }\n\n const cell = handle.cell;\n\n const id = cell.output_area._bokeh_element_id;\n const server_id = cell.output_area._bokeh_server_id;\n\n // Clean up Bokeh references\n if (id != null) {\n drop(id)\n }\n\n if (server_id !== undefined) {\n // Clean up Bokeh references\n const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n cell.notebook.kernel.execute(cmd_clean, {\n iopub: {\n output: function(msg) {\n const id = msg.content.text.trim()\n drop(id)\n }\n }\n });\n // Destroy server and session\n const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n cell.notebook.kernel.execute(cmd_destroy);\n }\n }\n\n /**\n * Handle when a new output is added\n */\n function handleAddOutput(event, handle) {\n const output_area = handle.output_area;\n const output = handle.output;\n\n // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n return\n }\n\n const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n\n if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n // store reference to embed id on output_area\n output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n }\n if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n const bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n const script_attrs = bk_div.children[0].attributes;\n for (let i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n 
toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n }\n\n function register_renderer(events, OutputArea) {\n\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n const toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n render(props, toinsert[toinsert.length - 1]);\n element.append(toinsert);\n return toinsert\n }\n\n /* Handle when an output is cleared or removed */\n events.on('clear_output.CodeCell', handleClearOutput);\n events.on('delete.Cell', handleClearOutput);\n\n /* Handle when a new output is added */\n events.on('output_added.OutputArea', handleAddOutput);\n\n /**\n * Register the mime type and append_mime function with output_area\n */\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n /* Is output safe? */\n safe: true,\n /* Index of renderer in `output_area.display_order` */\n index: 0\n });\n }\n\n // register the mime type if in Jupyter Notebook environment and previously unregistered\n if (root.Jupyter !== undefined) {\n const events = require('base/js/events');\n const OutputArea = require('notebook/js/outputarea').OutputArea;\n\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n }\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded(error = null) {\n const el = document.getElementById(null);\n if (el != null) {\n const html = (() => {\n if (typeof root.Bokeh === \"undefined\") {\n if (error == null) {\n return \"BokehJS is loading ...\";\n } else {\n return \"BokehJS failed to load.\";\n }\n } else {\n const prefix = `BokehJS ${root.Bokeh.version}`;\n if (error == null) {\n return `${prefix} successfully loaded.`;\n } else {\n return `${prefix} encountered errors while loading and may not function as expected.`;\n }\n }\n })();\n el.innerHTML = html;\n\n if (error != null) {\n const wrapper = document.createElement(\"div\");\n wrapper.style.overflow = \"auto\";\n wrapper.style.height = \"5em\";\n wrapper.style.resize = \"vertical\";\n const content = document.createElement(\"div\");\n content.style.fontFamily = \"monospace\";\n content.style.whiteSpace = \"pre-wrap\";\n content.style.backgroundColor = \"rgb(255, 221, 221)\";\n content.textContent = error.stack ?? error.toString();\n wrapper.append(content);\n el.append(wrapper);\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(() => display_loaded(error), 100);\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n 
root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.6.3.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n try {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n\n } catch (error) {throw error;\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if 
(!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(null)).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));", + "application/vnd.bokehjs_load.v0+json": "" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MalariaGEN Ag3 API client
\n", + " Please note that data are subject to terms of use,\n", + " for more information see \n", + " the MalariaGEN website or contact support@malariagen.net.\n", + " See also the Ag3 API docs.\n", + "
\n", + " Storage URL\n", + " simplecache::gs://vo_agam_release_master_us_central1
\n", + " Data releases available\n", + " 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10, 3.11, 3.12, 3.13, 3.14, 3.15, 3.16
\n", + " Results cache\n", + " /home/sanj/Software/malariagen-data-python/notebooks/results_cache
\n", + " Cohorts analysis\n", + " 20250502
\n", + " AIM analysis\n", + " 20220528
\n", + " Site filters analysis\n", + " dt_20200416
\n", + " Software version\n", + " malariagen_data 15.4.0.post6+f591dba1
\n", + " Client location\n", + " England, United Kingdom
\n", + " " + ], + "text/plain": [ + "\n", + "Storage URL : simplecache::gs://vo_agam_release_master_us_central1\n", + "Data releases available : 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10, 3.11, 3.12, 3.13, 3.14, 3.15, 3.16\n", + "Results cache : /home/sanj/Software/malariagen-data-python/notebooks/results_cache\n", + "Cohorts analysis : 20250502\n", + "AIM analysis : 20220528\n", + "Site filters analysis : dt_20200416\n", + "Software version : malariagen_data 15.4.0.post6+f591dba1\n", + "Client location : England, United Kingdom\n", + "---\n", + "Please note that data are subject to terms of use,\n", + "for more information see https://www.malariagen.net/data\n", + "or contact support@malariagen.net. For API documentation see \n", + "https://malariagen.github.io/malariagen-data-python/v15.4.0.post6+f591dba1/Ag3.html" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ag3 = malariagen_data.Ag3(\n", " \"simplecache::gs://vo_agam_release_master_us_central1\",\n", @@ -327,6 +426,86 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": 5, + "id": "51846e18", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4daea74540a74823be0621dcc0afd499", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Load haplotypes: 0%| | 0/67 [00:00= 12060. Found TBB_INTERFACE_VERSION = 12050. 
The TBB threading layer is disabled.\u001b[0m\n", + " warnings.warn(problem)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "ename": "AttributeError", + "evalue": "'DataFrame' object has no attribute 'label'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", + "\u001b[32m/tmp/ipykernel_537773/530656911.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m ag3.plot_haplotype_clustering_advanced(\n\u001b[32m 2\u001b[39m region=\u001b[33m\"2R:28,480,000-28,490,000\"\u001b[39m,\n\u001b[32m 3\u001b[39m sample_sets=[\u001b[33m\"3.0\"\u001b[39m],\n\u001b[32m 4\u001b[39m sample_query=\u001b[33m\"taxon == 'arabiensis'\"\u001b[39m,\n", + "\u001b[32m~/Software/malariagen-data-python/malariagen_data/anoph/hapclust.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, region, analysis, snp_transcript, snp_colorscale, snp_filter_min_maf, snp_query, sample_sets, sample_query, sample_query_options, random_seed, cohort_size, cut_height, min_cluster_size, color, symbol, linkage_method, count_sort, distance_sort, title, title_font_size, width, dendrogram_height, snp_row_height, show, renderer, render_mode, leaf_y, marker_size, line_width, line_color, color_discrete_sequence, color_discrete_map, category_orders, legend_sizing, chunks, inline_array)\u001b[39m\n\u001b[32m 415\u001b[39m )\n\u001b[32m 416\u001b[39m figures.append(snp_trace)\n\u001b[32m 417\u001b[39m subplot_heights.append(\u001b[32m25\u001b[39m)\n\u001b[32m 418\u001b[39m \n\u001b[32m--> \u001b[39m\u001b[32m419\u001b[39m figures, subplot_heights, df_haps = self._insert_hapclust_snp_trace(\n\u001b[32m 420\u001b[39m transcript=snp_transcript,\n\u001b[32m 421\u001b[39m snp_query=snp_query,\n\u001b[32m 422\u001b[39m figures=figures,\n", + 
"\u001b[32m~/Software/malariagen-data-python/malariagen_data/anoph/hapclust.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, figures, subplot_heights, transcript, analysis, dendro_sample_id_order, snp_filter_min_maf, snp_colorscale, snp_query, snp_row_height, sample_sets, sample_query, chunks, inline_array)\u001b[39m\n\u001b[32m 561\u001b[39m ):\n\u001b[32m 562\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m plotly \u001b[38;5;28;01mimport\u001b[39;00m graph_objects \u001b[38;5;28;01mas\u001b[39;00m go\n\u001b[32m 563\u001b[39m \n\u001b[32m 564\u001b[39m \u001b[38;5;66;03m# load genotype allele counts at SNP variants for each sample\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m565\u001b[39m df_haps = self.transcript_haplotypes(\n\u001b[32m 566\u001b[39m transcript=transcript,\n\u001b[32m 567\u001b[39m snp_query=snp_query,\n\u001b[32m 568\u001b[39m sample_query=sample_query,\n", + "\u001b[32m~/Software/malariagen-data-python/malariagen_data/anoph/hapclust.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, transcript, sample_sets, sample_query, analysis, snp_query, chunks, inline_array)\u001b[39m\n\u001b[32m 529\u001b[39m sample_ids = ds_haps[\u001b[33m\"sample_id\"\u001b[39m].values\n\u001b[32m 530\u001b[39m \n\u001b[32m 531\u001b[39m \u001b[38;5;66;03m# Filter df_eff to haplotypes, and filter haplotypes to SNPs present in df_eff\u001b[39;00m\n\u001b[32m 532\u001b[39m df_eff = df_eff.query(\u001b[33m\"pos_alt in @h_pos_alts\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m533\u001b[39m label = df_eff.label.values\n\u001b[32m 534\u001b[39m haps_bool = np.isin(h_pos_alts, df_eff.pos_alt)\n\u001b[32m 535\u001b[39m haps = haps.compress(haps_bool)\n\u001b[32m 536\u001b[39m \n", + "\u001b[32m~/apps/miniforge3/lib/python3.12/site-packages/pandas/core/generic.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, name)\u001b[39m\n\u001b[32m 6295\u001b[39m \u001b[38;5;28;01mand\u001b[39;00m name \u001b[38;5;28;01mnot\u001b[39;00m \u001b[38;5;28;01min\u001b[39;00m 
self._accessors\n\u001b[32m 6296\u001b[39m \u001b[38;5;28;01mand\u001b[39;00m self._info_axis._can_hold_identifiers_and_holds_name(name)\n\u001b[32m 6297\u001b[39m ):\n\u001b[32m 6298\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m self[name]\n\u001b[32m-> \u001b[39m\u001b[32m6299\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m object.__getattribute__(self, name)\n", + "\u001b[31mAttributeError\u001b[39m: 'DataFrame' object has no attribute 'label'" + ] + } + ], + "source": [ + "ag3.plot_haplotype_clustering_advanced(\n", + " region=\"2R:28,480,000-28,490,000\",\n", + " sample_sets=[\"3.0\"],\n", + " sample_query=\"taxon == 'arabiensis'\",\n", + " analysis=\"gamb_colu_arab\",\n", + " # symbol=new_cohorts,\n", + " color=\"year\",\n", + " cohort_size=None,\n", + " snp_transcript=\"AGAP002862-RA\",\n", + " # width=1000,\n", + " # height=400,\n", + ")" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -421,7 +600,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -435,12 +614,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "vscode": { - "interpreter": { - "hash": "3b9ddb1005cd06989fd869b9e3d566470f1be01faa610bb17d64e58e32302e8b" - } + "version": "3.12.8" }, "widgets": { "application/vnd.jupyter.widget-state+json": { From 510b639724bed155a63cff1b9770d313eb911060 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Sun, 7 Sep 2025 16:18:58 +0100 Subject: [PATCH 04/15] fixes for hap data --- malariagen_data/anoph/hapclust.py | 83 ++++++--- malariagen_data/anoph/hapclust_params.py | 12 ++ notebooks/plot_haplotype_clustering.ipynb | 197 +++------------------- 3 files changed, 93 insertions(+), 199 deletions(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 10194b29a..1a6532ee6 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py 
@@ -48,6 +48,7 @@ def plot_haplotype_clustering( color: plotly_params.color = None, symbol: plotly_params.symbol = None, linkage_method: hapclust_params.linkage_method = hapclust_params.linkage_method_default, + distance_metric: hapclust_params.distance_metric = hapclust_params.distance_metric_default, count_sort: Optional[tree_params.count_sort] = None, distance_sort: Optional[tree_params.distance_sort] = None, title: plotly_params.title = True, @@ -90,6 +91,7 @@ def plot_haplotype_clustering( dist, phased_samples, n_snps_used = self.haplotype_pairwise_distances( region=region, analysis=analysis, + distance_metric=distance_metric, sample_sets=sample_sets, sample_query=sample_query, sample_query_options=sample_query_options, @@ -168,7 +170,7 @@ def plot_haplotype_clustering( leaf_color_discrete_map=color_discrete_map_prepped, leaf_category_orders=category_orders_prepped, template="simple_white", - y_axis_title="Distance (no. SNPs)", + y_axis_title=f"Distance ({distance_metric})", y_axis_buffer=1, ) @@ -205,6 +207,7 @@ def plot_haplotype_clustering( def haplotype_pairwise_distances( self, region: base_params.regions, + distance_metric: hapclust_params.distance_metric = hapclust_params.distance_metric_default, analysis: hap_params.analysis = base_params.DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, sample_query: Optional[base_params.sample_query] = None, @@ -223,6 +226,7 @@ def haplotype_pairwise_distances( region_prepped = self._prep_region_cache_param(region=region) params = dict( region=region_prepped, + distance_metric=distance_metric, analysis=analysis, sample_sets=sample_sets_prepped, sample_query=sample_query, @@ -252,6 +256,7 @@ def _haplotype_pairwise_distances( self, *, region, + distance_metric, analysis, sample_sets, sample_query, @@ -295,10 +300,27 @@ def _haplotype_pairwise_distances( # to allow these to be saved to the results cache. 
phased_samples = ds_haps["sample_id"].values.astype("U") + # Number of sites + n_total_sites = region["end"] - region["start"] + 1 + + # Adjust distances if dxy requested + if distance_metric == "dxy": + # Normalize by total sites (common definition of dxy) + dist = dist / n_total_sites + elif distance_metric == "hamming": + # Leave as raw SNP differences + pass + else: + raise ValueError( + f"Unsupported distance_metric: {distance_metric}. " + "Choose from {'hamming', 'dxy'}." + ) + return dict( dist=dist, phased_samples=phased_samples, - n_snps=np.array(ht.shape[0]), + n_snps=np.array(n_total_sites), + n_seg_sites=np.array(ht.shape[0]), ) def plot_haplotype_clustering_advanced( @@ -314,8 +336,10 @@ def plot_haplotype_clustering_advanced( sample_query_options=None, random_seed=42, cohort_size=None, - cut_height=None, + distance_metric="hamming", + cluster_threshold=None, min_cluster_size=None, + cluster_criterion="distance", color=None, symbol=None, linkage_method="complete", @@ -358,6 +382,7 @@ def plot_haplotype_clustering_advanced( count_sort=count_sort, cohort_size=cohort_size, distance_sort=distance_sort, + distance_metric=distance_metric, linkage_method=linkage_method, color=color, symbol=symbol, @@ -389,23 +414,27 @@ def plot_haplotype_clustering_advanced( figures = [fig_dendro] subplot_heights = [dendrogram_height] - if cut_height and min_cluster_size: - cluster_col_list = px.colors.sequential.Turbo.copy() - cluster_col_list.insert(0, "white") - + if cluster_threshold and min_cluster_size: df_clusters = self.cut_dist_tree( dist=res["dist"], dist_samples=_make_unique(np.repeat(res["dist_samples"], 2)), dendro_sample_id_order=dendro_sample_id_order, linkage_method=linkage_method, - cut_height=cut_height, + cluster_threshold=cluster_threshold, min_cluster_size=min_cluster_size, - count_sort=count_sort, - distance_sort=distance_sort, + cluster_criterion=cluster_criterion, ) leaf_data = leaf_data.merge(df_clusters.T.reset_index()) + # if more than 8 clusters, 
use px.colors.qualitative.Alphabet + if df_clusters.max().max() > 8: + cluster_col_list = px.colors.qualitative.Alphabet.copy() + cluster_col_list.insert(0, "white") + else: + cluster_col_list = px.colors.qualitative.Dark2.copy() + cluster_col_list.insert(0, "white") + snp_trace = go.Heatmap( z=df_clusters.values, y=df_clusters.index.to_list(), @@ -455,7 +484,7 @@ def plot_haplotype_clustering_advanced( legend=dict(itemsizing=legend_sizing, tracegroupgap=0), ) - if snp_transcript: + if snp_transcript and n_aa > 0: # add lines to aa plot aa_idx = len(figures) fig.add_hline(y=-0.5, line_width=1, line_color="grey", row=aa_idx, col=1) @@ -502,9 +531,13 @@ def transcript_haplotypes( """ # Get SNP genotype allele counts for the transcript, applying snp_query - df_eff = self.snp_effects( - transcript=transcript, - ).query(snp_query) + df_eff = ( + self.snp_effects( + transcript=transcript, + ) + .query(snp_query) + .reset_index(drop=True) + ) df_eff["label"] = pandas_apply( _make_snp_label_effect, @@ -611,10 +644,9 @@ def cut_dist_tree( dist_samples, dendro_sample_id_order, linkage_method, - cut_height, + cluster_threshold, + cluster_criterion, min_cluster_size, - count_sort, - distance_sort, ): """ Create a one-row DataFrame with haplotype_ids as columns and cluster assignments as values @@ -627,14 +659,17 @@ def cut_dist_tree( List/array of individual identifiers (haplotype_ids) linkage_method : str Method used to calculate the linkage matrix - cut_height : float + cluster_threshold : float Height at which to cut the dendrogram min_cluster_size : int, default=1 Minimum number of individuals required in a cluster to be included - count_sort : bool - Whether to sort clusters by count - distance_sort : bool - Whether to sort clusters by distance + dendro_sample_id_order : array-like + List/array of individual identifiers (haplotype_ids) in the order they appear in + the dendrogram + cluster_criterion : str, default='distance' + The cluster_criterion to use in forming flat 
clusters. One of +        'inconsistent', 'distance', 'maxclust', 'maxclust_monocrit', 'monocrit' +        See scipy.cluster.hierarchy.fcluster for details.          Returns:         --------     @@ -646,7 +681,9 @@ def cut_dist_tree(         Z = linkage(dist, method=linkage_method)           # Get cluster assignments for each individual -    cluster_assignments = fcluster(Z, cut_height, criterion="distance") +    cluster_assignments = fcluster( +        Z, t=cluster_threshold, criterion=cluster_criterion +    )           # Create initial DataFrame         df = ( diff --git a/malariagen_data/anoph/hapclust_params.py b/malariagen_data/anoph/hapclust_params.py index 4f53fc823..08bc67351 100644 --- a/malariagen_data/anoph/hapclust_params.py +++ b/malariagen_data/anoph/hapclust_params.py @@ -1,5 +1,17 @@ """Parameters for haplotype clustering functions."""   from .clustering_params import linkage_method +from typing_extensions import Annotated, TypeAlias, Literal   linkage_method_default: linkage_method = "single" + +distance_metric: TypeAlias = Annotated[ +    Literal["hamming", "dxy"], +    """ +    The distance metric to use for calculating pairwise distances between haplotypes. +    'hamming' computes the Hamming distance (number of differing SNPs) between haplotypes. +    'dxy' computes the average number of nucleotide differences per site between haplotypes. 
+ """, +] + +distance_metric_default: str = "hamming" diff --git a/notebooks/plot_haplotype_clustering.ipynb b/notebooks/plot_haplotype_clustering.ipynb index fd344a91e..f035681b9 100644 --- a/notebooks/plot_haplotype_clustering.ipynb +++ b/notebooks/plot_haplotype_clustering.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d3f73666", "metadata": {}, "outputs": [], @@ -21,109 +21,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "38abbb28", "metadata": {}, - "outputs": [ - { - "data": { - "application/javascript": "'use strict';\n(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\nconst JS_MIME_TYPE = 'application/javascript';\n const HTML_MIME_TYPE = 'text/html';\n const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n const CLASS_NAME = 'output_bokeh rendered_html';\n\n /**\n * Render data to the DOM node\n */\n function render(props, node) {\n const script = document.createElement(\"script\");\n node.appendChild(script);\n }\n\n /**\n * Handle when an output is cleared or removed\n */\n function handleClearOutput(event, handle) {\n function drop(id) {\n const view = Bokeh.index.get_by_id(id)\n if (view != null) {\n view.model.document.clear()\n Bokeh.index.delete(view)\n }\n }\n\n const cell = handle.cell;\n\n const id = cell.output_area._bokeh_element_id;\n const server_id = cell.output_area._bokeh_server_id;\n\n // Clean up Bokeh references\n if (id != null) {\n drop(id)\n }\n\n if (server_id !== undefined) {\n // Clean up Bokeh references\n const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n cell.notebook.kernel.execute(cmd_clean, {\n iopub: {\n 
output: function(msg) {\n const id = msg.content.text.trim()\n drop(id)\n }\n }\n });\n // Destroy server and session\n const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n cell.notebook.kernel.execute(cmd_destroy);\n }\n }\n\n /**\n * Handle when a new output is added\n */\n function handleAddOutput(event, handle) {\n const output_area = handle.output_area;\n const output = handle.output;\n\n // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n return\n }\n\n const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n\n if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n // store reference to embed id on output_area\n output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n }\n if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n const bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n const script_attrs = bk_div.children[0].attributes;\n for (let i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n }\n\n function register_renderer(events, OutputArea) {\n\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n const toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n 
render(props, toinsert[toinsert.length - 1]);\n element.append(toinsert);\n return toinsert\n }\n\n /* Handle when an output is cleared or removed */\n events.on('clear_output.CodeCell', handleClearOutput);\n events.on('delete.Cell', handleClearOutput);\n\n /* Handle when a new output is added */\n events.on('output_added.OutputArea', handleAddOutput);\n\n /**\n * Register the mime type and append_mime function with output_area\n */\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n /* Is output safe? */\n safe: true,\n /* Index of renderer in `output_area.display_order` */\n index: 0\n });\n }\n\n // register the mime type if in Jupyter Notebook environment and previously unregistered\n if (root.Jupyter !== undefined) {\n const events = require('base/js/events');\n const OutputArea = require('notebook/js/outputarea').OutputArea;\n\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n }\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded(error = null) {\n const el = document.getElementById(null);\n if (el != null) {\n const html = (() => {\n if (typeof root.Bokeh === \"undefined\") {\n if (error == null) {\n return \"BokehJS is loading ...\";\n } else {\n return \"BokehJS failed to load.\";\n }\n } else {\n const prefix = `BokehJS ${root.Bokeh.version}`;\n if (error == null) {\n return `${prefix} successfully loaded.`;\n } else {\n return `${prefix} encountered errors while loading and may not function as expected.`;\n }\n }\n })();\n el.innerHTML = html;\n\n if (error != null) {\n const wrapper = document.createElement(\"div\");\n wrapper.style.overflow = \"auto\";\n wrapper.style.height = \"5em\";\n wrapper.style.resize = \"vertical\";\n const content = document.createElement(\"div\");\n content.style.fontFamily = \"monospace\";\n content.style.whiteSpace = \"pre-wrap\";\n content.style.backgroundColor = \"rgb(255, 221, 221)\";\n content.textContent = error.stack ?? error.toString();\n wrapper.append(content);\n el.append(wrapper);\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(() => display_loaded(error), 100);\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n 
root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.6.3.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.6.3.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n try {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n\n } catch (error) {throw error;\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if 
(!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(null)).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));", - "application/vnd.bokehjs_load.v0+json": "" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MalariaGEN Ag3 API client
\n", - " Please note that data are subject to terms of use,\n", - " for more information see \n", - " the MalariaGEN website or contact support@malariagen.net.\n", - " See also the Ag3 API docs.\n", - "
\n", - " Storage URL\n", - " simplecache::gs://vo_agam_release_master_us_central1
\n", - " Data releases available\n", - " 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10, 3.11, 3.12, 3.13, 3.14, 3.15, 3.16
\n", - " Results cache\n", - " /home/sanj/Software/malariagen-data-python/notebooks/results_cache
\n", - " Cohorts analysis\n", - " 20250502
\n", - " AIM analysis\n", - " 20220528
\n", - " Site filters analysis\n", - " dt_20200416
\n", - " Software version\n", - " malariagen_data 15.4.0.post6+f591dba1
\n", - " Client location\n", - " England, United Kingdom
\n", - " " - ], - "text/plain": [ - "\n", - "Storage URL : simplecache::gs://vo_agam_release_master_us_central1\n", - "Data releases available : 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10, 3.11, 3.12, 3.13, 3.14, 3.15, 3.16\n", - "Results cache : /home/sanj/Software/malariagen-data-python/notebooks/results_cache\n", - "Cohorts analysis : 20250502\n", - "AIM analysis : 20220528\n", - "Site filters analysis : dt_20200416\n", - "Software version : malariagen_data 15.4.0.post6+f591dba1\n", - "Client location : England, United Kingdom\n", - "---\n", - "Please note that data are subject to terms of use,\n", - "for more information see https://www.malariagen.net/data\n", - "or contact support@malariagen.net. For API documentation see \n", - "https://malariagen.github.io/malariagen-data-python/v15.4.0.post6+f591dba1/Ag3.html" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "ag3 = malariagen_data.Ag3(\n", " \"simplecache::gs://vo_agam_release_master_us_central1\",\n", @@ -413,7 +314,7 @@ "metadata": {}, "outputs": [], "source": [ - "ag3.plot_haplotype_clustering(\n", + "ag3.plot_haplotype_clustering_advanced(\n", " region=\"2R:28,480,000-28,490,000\",\n", " sample_sets=[\"3.0\"],\n", " sample_query=\"taxon == 'arabiensis'\",\n", @@ -421,88 +322,32 @@ " symbol=new_cohorts,\n", " color=\"year\",\n", " cohort_size=None,\n", - " width=1000,\n", - " height=400,\n", + " snp_transcript=\"AGAP002863-RA\",\n", + " distance_metric='hamming',\n", + " snp_filter_min_maf=0.05\n", ")" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "51846e18", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4daea74540a74823be0621dcc0afd499", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Load haplotypes: 0%| | 0/67 
[00:00= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.\u001b[0m\n", - " warnings.warn(problem)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "ename": "AttributeError", - "evalue": "'DataFrame' object has no attribute 'label'", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", - "\u001b[32m/tmp/ipykernel_537773/530656911.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m ag3.plot_haplotype_clustering_advanced(\n\u001b[32m 2\u001b[39m region=\u001b[33m\"2R:28,480,000-28,490,000\"\u001b[39m,\n\u001b[32m 3\u001b[39m sample_sets=[\u001b[33m\"3.0\"\u001b[39m],\n\u001b[32m 4\u001b[39m sample_query=\u001b[33m\"taxon == 'arabiensis'\"\u001b[39m,\n", - "\u001b[32m~/Software/malariagen-data-python/malariagen_data/anoph/hapclust.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, region, analysis, snp_transcript, snp_colorscale, snp_filter_min_maf, snp_query, sample_sets, sample_query, sample_query_options, random_seed, cohort_size, cut_height, min_cluster_size, color, symbol, linkage_method, count_sort, distance_sort, title, title_font_size, width, dendrogram_height, snp_row_height, show, renderer, render_mode, leaf_y, marker_size, line_width, line_color, color_discrete_sequence, color_discrete_map, category_orders, legend_sizing, chunks, inline_array)\u001b[39m\n\u001b[32m 415\u001b[39m )\n\u001b[32m 416\u001b[39m figures.append(snp_trace)\n\u001b[32m 417\u001b[39m subplot_heights.append(\u001b[32m25\u001b[39m)\n\u001b[32m 418\u001b[39m \n\u001b[32m--> \u001b[39m\u001b[32m419\u001b[39m figures, subplot_heights, df_haps = self._insert_hapclust_snp_trace(\n\u001b[32m 420\u001b[39m transcript=snp_transcript,\n\u001b[32m 421\u001b[39m snp_query=snp_query,\n\u001b[32m 
422\u001b[39m figures=figures,\n", - "\u001b[32m~/Software/malariagen-data-python/malariagen_data/anoph/hapclust.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, figures, subplot_heights, transcript, analysis, dendro_sample_id_order, snp_filter_min_maf, snp_colorscale, snp_query, snp_row_height, sample_sets, sample_query, chunks, inline_array)\u001b[39m\n\u001b[32m 561\u001b[39m ):\n\u001b[32m 562\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m plotly \u001b[38;5;28;01mimport\u001b[39;00m graph_objects \u001b[38;5;28;01mas\u001b[39;00m go\n\u001b[32m 563\u001b[39m \n\u001b[32m 564\u001b[39m \u001b[38;5;66;03m# load genotype allele counts at SNP variants for each sample\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m565\u001b[39m df_haps = self.transcript_haplotypes(\n\u001b[32m 566\u001b[39m transcript=transcript,\n\u001b[32m 567\u001b[39m snp_query=snp_query,\n\u001b[32m 568\u001b[39m sample_query=sample_query,\n", - "\u001b[32m~/Software/malariagen-data-python/malariagen_data/anoph/hapclust.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, transcript, sample_sets, sample_query, analysis, snp_query, chunks, inline_array)\u001b[39m\n\u001b[32m 529\u001b[39m sample_ids = ds_haps[\u001b[33m\"sample_id\"\u001b[39m].values\n\u001b[32m 530\u001b[39m \n\u001b[32m 531\u001b[39m \u001b[38;5;66;03m# Filter df_eff to haplotypes, and filter haplotypes to SNPs present in df_eff\u001b[39;00m\n\u001b[32m 532\u001b[39m df_eff = df_eff.query(\u001b[33m\"pos_alt in @h_pos_alts\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m533\u001b[39m label = df_eff.label.values\n\u001b[32m 534\u001b[39m haps_bool = np.isin(h_pos_alts, df_eff.pos_alt)\n\u001b[32m 535\u001b[39m haps = haps.compress(haps_bool)\n\u001b[32m 536\u001b[39m \n", - "\u001b[32m~/apps/miniforge3/lib/python3.12/site-packages/pandas/core/generic.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, name)\u001b[39m\n\u001b[32m 6295\u001b[39m \u001b[38;5;28;01mand\u001b[39;00m name \u001b[38;5;28;01mnot\u001b[39;00m 
\u001b[38;5;28;01min\u001b[39;00m self._accessors\n\u001b[32m 6296\u001b[39m \u001b[38;5;28;01mand\u001b[39;00m self._info_axis._can_hold_identifiers_and_holds_name(name)\n\u001b[32m 6297\u001b[39m ):\n\u001b[32m 6298\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m self[name]\n\u001b[32m-> \u001b[39m\u001b[32m6299\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m object.__getattribute__(self, name)\n", - "\u001b[31mAttributeError\u001b[39m: 'DataFrame' object has no attribute 'label'" - ] - } - ], + "outputs": [], "source": [ "ag3.plot_haplotype_clustering_advanced(\n", - " region=\"2R:28,480,000-28,490,000\",\n", - " sample_sets=[\"3.0\"],\n", - " sample_query=\"taxon == 'arabiensis'\",\n", - " analysis=\"gamb_colu_arab\",\n", - " # symbol=new_cohorts,\n", - " color=\"year\",\n", + " region=\"2L:2,360,000-2,431,000\",\n", + " sample_sets=[\"AG1000G-GH\"],\n", + " analysis=\"gamb_colu\",\n", + " color=\"taxon\",\n", " cohort_size=None,\n", - " snp_transcript=\"AGAP002862-RA\",\n", - " # width=1000,\n", - " # height=400,\n", + " snp_transcript=\"AGAP004707-RD\",\n", + " distance_metric='dxy',\n", + " cluster_criterion='distance',\n", + " cluster_threshold=0.0007,\n", + " min_cluster_size=5,\n", + " snp_filter_min_maf=0.05,\n", + " show=True\n", ")" ] }, @@ -600,7 +445,7 @@ ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, From e7adb65e222575ab7b9ac1279132fcc1a7124ca0 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Sun, 7 Sep 2025 16:34:25 +0100 Subject: [PATCH 05/15] mypy fix --- malariagen_data/anoph/hapclust_params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/anoph/hapclust_params.py b/malariagen_data/anoph/hapclust_params.py index 08bc67351..71c5c2b4f 100644 --- a/malariagen_data/anoph/hapclust_params.py +++ b/malariagen_data/anoph/hapclust_params.py @@ -14,4 +14,4 @@ """, ] -distance_metric_default: str = "hamming" 
+distance_metric_default: Literal["hamming", "dxy"] = "hamming" From 9741b730e00e03dbc213ef37d3cf74f9caa3620b Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Sun, 7 Sep 2025 16:52:05 +0100 Subject: [PATCH 06/15] tests fix hapclust output type --- malariagen_data/anoph/hapclust.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 1a6532ee6..c90268c01 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -68,7 +68,7 @@ def plot_haplotype_clustering( legend_sizing: plotly_params.legend_sizing = "constant", chunks: base_params.chunks = base_params.native_chunks, inline_array: base_params.inline_array = base_params.inline_array_default, - ) -> plotly_params.figure: + ) -> Optional[dict]: import sys # Normalise params. From 50a8c228f60b4eb2281f3fd880d330faef2ef75e Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 10:33:13 +0100 Subject: [PATCH 07/15] add docstring and typing --- malariagen_data/anoph/dipclust.py | 2 + malariagen_data/anoph/hapclust.py | 92 ++++++++++++++++++------------- 2 files changed, 56 insertions(+), 38 deletions(-) diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py index 043019ec4..82d99b758 100644 --- a/malariagen_data/anoph/dipclust.py +++ b/malariagen_data/anoph/dipclust.py @@ -742,6 +742,7 @@ def plot_diplotype_clustering_advanced( dendro_sample_id_order=dendro_sample_id_order, snp_filter_min_maf=snp_filter_min_maf, snp_colorscale=snp_colorscale, + snp_row_height=snp_row_height, chunks=chunks, inline_array=inline_array, ) @@ -759,6 +760,7 @@ def plot_diplotype_clustering_advanced( dendro_sample_id_order=dendro_sample_id_order, snp_filter_min_maf=snp_filter_min_maf, snp_colorscale=snp_colorscale, + snp_row_height=snp_row_height, chunks=chunks, inline_array=inline_array, ) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 
c90268c01..5000cd60e 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -14,6 +14,7 @@ hap_params, clustering_params, hapclust_params, + dipclust_params, ) from .snp_data import AnophelesSnpData from .snp_frq import AA_CHANGE_QUERY, _make_snp_label_effect @@ -323,46 +324,60 @@ def _haplotype_pairwise_distances( n_seg_sites=np.array(ht.shape[0]), ) + @check_types + @doc( + summary=""" + Hierarchically cluster haplotypes in region, and produce an interactive plot + with optional SNP haplotype heatmap and/or cluster assignments. + """, + returns=""" + If `show` is False, returns a tuple (fig, leaf_data, df_haps) where + `fig` is a plotly Figure object, `leaf_data` is a DataFrame with + metadata for each haplotype in the dendrogram, and `df_haps` is a DataFrame + of haplotype calls for each sample at each SNP in the specified transcript. + If `show` is True, displays the figure and returns None. + """, + ) def plot_haplotype_clustering_advanced( self, - region, - analysis, - snp_transcript=None, - snp_colorscale="Greys", - snp_filter_min_maf=0.05, + region: base_params.regions, + analysis: hap_params.analysis = base_params.DEFAULT, + snp_transcript: dipclust_params.snp_transcript = None, + snp_colorscale: dipclust_params.snp_colorscale = "Greys", + snp_filter_min_maf: dipclust_params.snp_filter_min_maf = 0.05, snp_query=AA_CHANGE_QUERY, - sample_sets=None, - sample_query=None, - sample_query_options=None, - random_seed=42, - cohort_size=None, - distance_metric="hamming", - cluster_threshold=None, - min_cluster_size=None, + sample_sets: Optional[base_params.sample_sets] = None, + sample_query: Optional[base_params.sample_query] = None, + sample_query_options: Optional[base_params.sample_query_options] = None, + random_seed: base_params.random_seed = 42, + cohort_size: Optional[base_params.cohort_size] = None, + distance_metric: hapclust_params.distance_metric = hapclust_params.distance_metric_default, + cluster_threshold: 
Optional[float] = None, + min_cluster_size: Optional[int] = None, cluster_criterion="distance", - color=None, - symbol=None, - linkage_method="complete", - count_sort=None, - distance_sort=None, - title=True, - title_font_size=14, - width=None, - dendrogram_height=300, - snp_row_height=25, - show=True, - renderer=None, - render_mode="svg", - leaf_y=0, - marker_size=5, - line_width=0.5, - line_color="black", - color_discrete_sequence=None, - color_discrete_map=None, - category_orders=None, - legend_sizing="constant", - chunks="native", - inline_array=True, + color: plotly_params.color = None, + symbol: plotly_params.symbol = None, + linkage_method: dipclust_params.linkage_method = "complete", + count_sort: Optional[tree_params.count_sort] = None, + distance_sort: Optional[tree_params.distance_sort] = None, + title: plotly_params.title = True, + title_font_size: plotly_params.title_font_size = 14, + width: plotly_params.fig_width = None, + dendrogram_height: plotly_params.height = 300, + snp_row_height: plotly_params.height = 25, + show: plotly_params.show = True, + renderer: plotly_params.renderer = None, + render_mode: plotly_params.render_mode = "svg", + leaf_y: plotly_params.leaf_y = 0, + marker_size: plotly_params.marker_size = 5, + line_width: plotly_params.line_width = 0.5, + line_color: plotly_params.line_color = "black", + color_discrete_sequence: plotly_params.color_discrete_sequence = None, + color_discrete_map: plotly_params.color_discrete_map = None, + category_orders: plotly_params.category_order = None, + legend_sizing: plotly_params.legend_sizing = "constant", + chunks: base_params.chunks = base_params.native_chunks, + inline_array: base_params.inline_array = base_params.inline_array_default, ): import plotly.express as px import plotly.graph_objects as go @@ -565,7 +580,6 @@ def transcript_haplotypes( h_pos = allel.SortedIndex(ds_haps["variant_position"].values) h_alts = ds_haps["variant_allele"].values.astype(str)[:, 1] h_pos_alts = 
np.array([f"{pos}-{h_alts[i]}" for i, pos in enumerate(h_pos)]) - sample_ids = ds_haps["sample_id"].values # Filter df_eff to haplotypes, and filter haplotypes to SNPs present in df_eff df_eff = df_eff.query("pos_alt in @h_pos_alts") @@ -577,7 +591,9 @@ def transcript_haplotypes( df_haps = pd.DataFrame( haps, index=label, - columns=_make_unique(np.repeat(sample_ids, 2)), # two haplotypes per sample + columns=_make_unique( + np.repeat(ds_haps["sample_id"].values, 2) + ), # two haplotypes per sample ) return df_haps From 69359669da3a4e59fe628c3b71aeb7ddb18d515c Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 10:35:58 +0100 Subject: [PATCH 08/15] fix snp_colorscale type --- malariagen_data/anoph/hapclust.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 5000cd60e..8f69d870e 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -343,7 +343,7 @@ def plot_haplotype_clustering_advanced( region: base_params.regions, analysis: hap_params.analysis = base_params.DEFAULT, snp_transcript: dipclust_params.snp_transcript = None, - snp_colorscale: dipclust_params.snp_colorscale = "Greys", + snp_colorscale: plotly_params.snp_colorscale = "Greys", snp_filter_min_maf: dipclust_params.snp_filter_min_maf = 0.05, snp_query=AA_CHANGE_QUERY, sample_sets: Optional[base_params.sample_sets] = None, From 14b58462be2f8f77b9216a80ed0a7c8d65e695bb Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 10:42:28 +0100 Subject: [PATCH 09/15] fix errors --- malariagen_data/anoph/hapclust.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 8f69d870e..a172bba62 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -19,9 +19,12 @@ from .snp_data import AnophelesSnpData from .snp_frq import 
AA_CHANGE_QUERY, _make_snp_label_effect from .hap_data import AnophelesHapData +from .dipclust import AnophelesDipClustAnalysis -class AnophelesHapClustAnalysis(AnophelesHapData, AnophelesSnpData): +class AnophelesHapClustAnalysis( + AnophelesHapData, AnophelesSnpData, AnophelesDipClustAnalysis +): def __init__( self, **kwargs, @@ -342,9 +345,9 @@ def plot_haplotype_clustering_advanced( self, region: base_params.regions, analysis: hap_params.analysis = base_params.DEFAULT, - snp_transcript: dipclust_params.snp_transcript = None, - snp_colorscale: plotly_params.snp_colorscale = "Greys", - snp_filter_min_maf: dipclust_params.snp_filter_min_maf = 0.05, + snp_transcript: Optional[dipclust_params.snp_transcript] = None, + snp_colorscale: Optional[plotly_params.color_continuous_scale] = "Greys", + snp_filter_min_maf: float = 0.05, snp_query=AA_CHANGE_QUERY, sample_sets: Optional[base_params.sample_sets] = None, sample_query: Optional[base_params.sample_query] = None, @@ -368,7 +371,7 @@ def plot_haplotype_clustering_advanced( show: plotly_params.show = True, renderer: plotly_params.renderer = None, render_mode: plotly_params.render_mode = "svg", - leaf_y: plotly_params.leaf_y = 0, + leaf_y: clustering_params.leaf_y = 0, marker_size: plotly_params.marker_size = 5, line_width: plotly_params.line_width = 0.5, line_color: plotly_params.line_color = "black", From a878727c8a2fae4e47c83b54d5437d19a390f824 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 10:47:48 +0100 Subject: [PATCH 10/15] fix doc --- malariagen_data/anoph/hapclust.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index a172bba62..7f0c8c9ce 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -340,6 +340,10 @@ def _haplotype_pairwise_distances( of haplotype calls for each sample at each SNP in the specified transcript. If `show` is True, displays the figure and returns None. 
""", + parameters=dict( + snp_transcript="Plot amino acid variants for these transcripts.", + snp_filter_min_maf="Filter amino acid variants with alternate allele frequency below this threshold.", + ), ) def plot_haplotype_clustering_advanced( self, From 36f6d5156fd4941bd59b03ce455599de8f36464b Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 11:02:24 +0100 Subject: [PATCH 11/15] move concat_subplots outside of dipclust.py to plotly_dendrogram.py --- malariagen_data/anoph/dipclust.py | 54 ++-------------------------- malariagen_data/anoph/hapclust.py | 15 ++++---- malariagen_data/plotly_dendrogram.py | 51 ++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 59 deletions(-) diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py index 82d99b758..bcb14b6bf 100644 --- a/malariagen_data/anoph/dipclust.py +++ b/malariagen_data/anoph/dipclust.py @@ -13,7 +13,7 @@ multiallelic_diplotype_mean_sqeuclidean, multiallelic_diplotype_mean_cityblock, ) -from ..plotly_dendrogram import plot_dendrogram +from ..plotly_dendrogram import plot_dendrogram, concat_clustering_subplots from . import ( base_params, plotly_params, @@ -499,56 +499,6 @@ def _dipclust_snp_trace( return snp_trace, n_snps - def _dipclust_concat_subplots( - self, - figures, - width, - height, - row_heights, - region: base_params.regions, - n_snps: int, - sample_sets: Optional[base_params.sample_sets], - sample_query: Optional[base_params.sample_query], - ): - from plotly.subplots import make_subplots # type: ignore - - title_lines = [] - if sample_sets is not None: - title_lines.append(f"sample sets: {sample_sets}") - if sample_query is not None: - title_lines.append(f"sample query: {sample_query}") - title_lines.append(f"genomic region: {region} ({n_snps} SNPs)") - title = "
".join(title_lines) - - # make subplots - fig = make_subplots( - rows=len(figures), - cols=1, - shared_xaxes=True, - vertical_spacing=0.02, - row_heights=row_heights, - ) - - for i, figure in enumerate(figures): - if isinstance(figure, go.Figure): - # This is a figure, access the traces within it. - for trace in range(len(figure["data"])): - fig.append_trace(figure["data"][trace], row=i + 1, col=1) - else: - # Assume this is a trace, add directly. - fig.append_trace(figure, row=i + 1, col=1) - - fig.update_xaxes(visible=False) - fig.update_layout( - title=title, - width=width, - height=height, - hovermode="closest", - plot_bgcolor="white", - ) - - return fig - def _insert_dipclust_snp_trace( self, *, @@ -768,7 +718,7 @@ def plot_diplotype_clustering_advanced( # Calculate total height based on subplot heights, plus a fixed # additional component to allow for title, axes etc. height = sum(subplot_heights) + 50 - fig = self._dipclust_concat_subplots( + fig = concat_clustering_subplots( figures=figures, width=width, height=height, diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 7f0c8c9ce..c978eb18e 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -6,7 +6,7 @@ from numpydoc_decorator import doc # type: ignore from ..util import CacheMiss, check_types, pdist_abs_hamming, pandas_apply -from ..plotly_dendrogram import plot_dendrogram +from ..plotly_dendrogram import plot_dendrogram, concat_clustering_subplots from . 
import ( base_params, plotly_params, @@ -19,12 +19,9 @@ from .snp_data import AnophelesSnpData from .snp_frq import AA_CHANGE_QUERY, _make_snp_label_effect from .hap_data import AnophelesHapData -from .dipclust import AnophelesDipClustAnalysis -class AnophelesHapClustAnalysis( - AnophelesHapData, AnophelesSnpData, AnophelesDipClustAnalysis -): +class AnophelesHapClustAnalysis(AnophelesHapData, AnophelesSnpData): def __init__( self, **kwargs, @@ -343,6 +340,10 @@ def _haplotype_pairwise_distances( parameters=dict( snp_transcript="Plot amino acid variants for these transcripts.", snp_filter_min_maf="Filter amino acid variants with alternate allele frequency below this threshold.", + snp_query="Query to filter SNPs for amino acid heatmap. Default is to include all non-synonymous SNPs.", + cluster_threshold="Height at which to cut the dendrogram to form clusters. If not provided, no clusters assignment is not performed.", + min_cluster_size="Minimum number of haplotypes required in a cluster to be included when cutting the dendrogram. Default is 5.", + cluster_criterion="The cluster_criterion to use in forming flat clusters. One of 'inconsistent', 'distance', 'maxclust', 'maxclust_monochronic', 'monocrit'. See scipy.cluster.hierarchy.fcluster for details.", ), ) def plot_haplotype_clustering_advanced( @@ -360,7 +361,7 @@ def plot_haplotype_clustering_advanced( cohort_size: Optional[base_params.cohort_size] = None, distance_metric: hapclust_params.distance_metric = hapclust_params.distance_metric_default, cluster_threshold: Optional[float] = None, - min_cluster_size: Optional[int] = None, + min_cluster_size: Optional[int] = 5, cluster_criterion="distance", color: plotly_params.color = None, symbol: plotly_params.symbol = None, @@ -487,7 +488,7 @@ def plot_haplotype_clustering_advanced( # Calculate total height based on subplot heights, plus a fixed # additional component to allow for title, axes etc. 
height = sum(subplot_heights) + 50 - fig = self._dipclust_concat_subplots( + fig = concat_clustering_subplots( figures=figures, width=width, height=height, diff --git a/malariagen_data/plotly_dendrogram.py b/malariagen_data/plotly_dendrogram.py index a172c469b..8a94907d1 100644 --- a/malariagen_data/plotly_dendrogram.py +++ b/malariagen_data/plotly_dendrogram.py @@ -130,3 +130,54 @@ def plot_dendrogram( ) return fig, leaf_data + + +def concat_clustering_subplots( + figures, + width, + height, + row_heights, + region, + n_snps, + sample_sets, + sample_query, +): + from plotly.subplots import make_subplots # type: ignore + import plotly.graph_objects as go # type: ignore + + title_lines = [] + if sample_sets is not None: + title_lines.append(f"sample sets: {sample_sets}") + if sample_query is not None: + title_lines.append(f"sample query: {sample_query}") + title_lines.append(f"genomic region: {region} ({n_snps} SNPs)") + title = "
".join(title_lines) + + # make subplots + fig = make_subplots( + rows=len(figures), + cols=1, + shared_xaxes=True, + vertical_spacing=0.02, + row_heights=row_heights, + ) + + for i, figure in enumerate(figures): + if isinstance(figure, go.Figure): + # This is a figure, access the traces within it. + for trace in range(len(figure["data"])): + fig.append_trace(figure["data"][trace], row=i + 1, col=1) + else: + # Assume this is a trace, add directly. + fig.append_trace(figure, row=i + 1, col=1) + + fig.update_xaxes(visible=False) + fig.update_layout( + title=title, + width=width, + height=height, + hovermode="closest", + plot_bgcolor="white", + ) + + return fig From 5a273d0c97459c2a638b51596d44ce33ad791b5e Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 14:31:35 +0100 Subject: [PATCH 12/15] add support for multiple transcripts, lines --- malariagen_data/anoph/dipclust.py | 55 ++++++++++++- malariagen_data/anoph/hapclust.py | 126 ++++++++++++++++++++---------- 2 files changed, 136 insertions(+), 45 deletions(-) diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py index bcb14b6bf..825c92861 100644 --- a/malariagen_data/anoph/dipclust.py +++ b/malariagen_data/anoph/dipclust.py @@ -538,7 +538,7 @@ def _insert_dipclust_snp_trace( print( f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot." 
) - return figures, subplot_heights + return figures, subplot_heights, n_snps_transcript @doc( summary="""" @@ -679,8 +679,13 @@ def plot_diplotype_clustering_advanced( figures.append(cnv_trace) subplot_heights.append(cnv_row_height * n_cnv_genes) + n_snps_transcripts = [] if isinstance(snp_transcript, str): - figures, subplot_heights = self._insert_dipclust_snp_trace( + ( + figures, + subplot_heights, + n_snps_transcript, + ) = self._insert_dipclust_snp_trace( transcript=snp_transcript, figures=figures, subplot_heights=subplot_heights, @@ -696,9 +701,14 @@ def plot_diplotype_clustering_advanced( chunks=chunks, inline_array=inline_array, ) + n_snps_transcripts.append(n_snps_transcript) elif isinstance(snp_transcript, list): for st in snp_transcript: - figures, subplot_heights = self._insert_dipclust_snp_trace( + ( + figures, + subplot_heights, + n_snps_transcript, + ) = self._insert_dipclust_snp_trace( transcript=st, figures=figures, subplot_heights=subplot_heights, @@ -714,6 +724,7 @@ def plot_diplotype_clustering_advanced( chunks=chunks, inline_array=inline_array, ) + n_snps_transcripts.append(n_snps_transcript) # Calculate total height based on subplot heights, plus a fixed # additional component to allow for title, axes etc. 
@@ -739,6 +750,44 @@ def plot_diplotype_clustering_advanced( legend=dict(itemsizing=legend_sizing, tracegroupgap=0), ) + # add lines to aa plot - looks neater + if snp_transcript: + n_transcripts = ( + len(snp_transcript) if isinstance(snp_transcript, list) else 1 + ) + for i in range(n_transcripts): + tx_idx = len(figures) - n_transcripts + i + 1 + if n_snps_transcripts[i] > 0: + fig.add_hline( + y=-0.5, line_width=1, line_color="grey", row=tx_idx, col=1 + ) + for j in range(n_snps_transcripts[i]): + fig.add_hline( + y=j + 0.5, + line_width=1, + line_color="grey", + row=tx_idx, + col=1, + ) + + fig.update_xaxes( + showline=True, + linecolor="grey", + linewidth=1, + row=tx_idx, + col=1, + mirror=True, + ) + + fig.update_yaxes( + showline=True, + linecolor="grey", + linewidth=1, + row=tx_idx, + col=1, + mirror=True, + ) + if show: fig.show(renderer=renderer) return None diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index c978eb18e..cf9f0ef1b 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -468,22 +468,50 @@ def plot_haplotype_clustering_advanced( figures.append(snp_trace) subplot_heights.append(25) - figures, subplot_heights, df_haps = self._insert_hapclust_snp_trace( - transcript=snp_transcript, - snp_query=snp_query, - figures=figures, - subplot_heights=subplot_heights, - sample_sets=sample_sets, - sample_query=sample_query, - analysis=analysis, - dendro_sample_id_order=dendro_sample_id_order, - snp_filter_min_maf=snp_filter_min_maf, - snp_colorscale=snp_colorscale, - snp_row_height=snp_row_height, - chunks=chunks, - inline_array=inline_array, - ) - n_aa = df_haps.shape[0] + n_snps_transcripts = [] + if isinstance(snp_transcript, str): + ( + figures, + subplot_heights, + n_snps_transcript, + ) = self._insert_hapclust_snp_trace( + transcript=snp_transcript, + snp_query=snp_query, + figures=figures, + subplot_heights=subplot_heights, + sample_sets=sample_sets, + sample_query=sample_query, 
+ analysis=analysis, + dendro_sample_id_order=dendro_sample_id_order, + snp_filter_min_maf=snp_filter_min_maf, + snp_colorscale=snp_colorscale, + snp_row_height=snp_row_height, + chunks=chunks, + inline_array=inline_array, + ) + n_snps_transcripts.append(n_snps_transcript) + elif isinstance(snp_transcript, list): + for st in snp_transcript: + ( + figures, + subplot_heights, + n_snps_transcript, + ) = self._insert_hapclust_snp_trace( + transcript=st, + snp_query=snp_query, + figures=figures, + subplot_heights=subplot_heights, + sample_sets=sample_sets, + sample_query=sample_query, + analysis=analysis, + dendro_sample_id_order=dendro_sample_id_order, + snp_filter_min_maf=snp_filter_min_maf, + snp_colorscale=snp_colorscale, + snp_row_height=snp_row_height, + chunks=chunks, + inline_array=inline_array, + ) + n_snps_transcripts.append(n_snps_transcript) # Calculate total height based on subplot heights, plus a fixed # additional component to allow for title, axes etc. @@ -499,7 +527,7 @@ def plot_haplotype_clustering_advanced( n_snps=n_snps_cluster, ) - fig["layout"]["yaxis"]["title"] = "Distance" + fig["layout"]["yaxis"]["title"] = f"Distance ({distance_metric})" fig.update_layout( title_font=dict( size=title_font_size, @@ -507,37 +535,49 @@ def plot_haplotype_clustering_advanced( legend=dict(itemsizing=legend_sizing, tracegroupgap=0), ) - if snp_transcript and n_aa > 0: - # add lines to aa plot - aa_idx = len(figures) - fig.add_hline(y=-0.5, line_width=1, line_color="grey", row=aa_idx, col=1) - for i in range(n_aa): - fig.add_hline( - y=i + 0.5, line_width=1, line_color="grey", row=aa_idx, col=1 + # add lines to aa plot - looks neater + if snp_transcript: + n_transcripts = ( + len(snp_transcript) if isinstance(snp_transcript, list) else 1 + ) + for i in range(n_transcripts): + tx_idx = len(figures) - n_transcripts + i + 1 + if n_snps_transcripts[i] > 0: + fig.add_hline( + y=-0.5, line_width=1, line_color="grey", row=tx_idx, col=1 + ) + for j in 
range(n_snps_transcripts[i]): + fig.add_hline( + y=j + 0.5, + line_width=1, + line_color="grey", + row=tx_idx, + col=1, + ) + + fig.update_xaxes( + showline=True, + linecolor="grey", + linewidth=1, + row=tx_idx, + col=1, + mirror=True, ) - fig.update_xaxes( - showline=True, - linecolor="grey", - linewidth=1, - row=aa_idx, - col=1, - mirror=True, - ) - fig.update_yaxes( - showline=True, - linecolor="grey", - linewidth=1, - row=aa_idx, - col=1, - mirror=True, - ) + fig.update_yaxes( + showline=True, + linecolor="grey", + linewidth=1, + row=tx_idx, + col=1, + mirror=True, + ) if show: fig.show(renderer=renderer) return None else: - return fig, leaf_data, df_haps + return fig, leaf_data def transcript_haplotypes( self, @@ -642,6 +682,8 @@ def _insert_hapclust_snp_trace( df_haps = df_haps.assign(af=lambda x: x.sum(axis=1) / x.shape[1]) df_haps = df_haps.query("af > @snp_filter_min_maf").drop(columns="af") + n_snps_transcript = df_haps.shape[0] + if not df_haps.empty: snp_trace = go.Heatmap( z=df_haps.values, @@ -660,7 +702,7 @@ def _insert_hapclust_snp_trace( print( f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot." 
) - return figures, subplot_heights, df_haps + return figures, subplot_heights, n_snps_transcript def cut_dist_tree( self, From 95923d667d0ee017d68174f40c99bf719b922c24 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Mon, 8 Sep 2025 15:06:32 +0100 Subject: [PATCH 13/15] rename cluster_id --- malariagen_data/anoph/hapclust.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index cf9f0ef1b..ccc4f389f 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -756,7 +756,7 @@ def cut_dist_tree( pd.DataFrame( { "sample_id": dist_samples, - "cluster_id": _filter_and_remap( + "Cluster ID": _filter_and_remap( cluster_assignments, x=min_cluster_size ), } From 7b969710573048de1b6b2582c51b10065c6796b2 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Fri, 19 Sep 2025 15:48:19 +0100 Subject: [PATCH 14/15] add anophFreqAnalysis class to hapclust --- malariagen_data/anoph/hapclust.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index ccc4f389f..571fd06b3 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -17,11 +17,11 @@ dipclust_params, ) from .snp_data import AnophelesSnpData -from .snp_frq import AA_CHANGE_QUERY, _make_snp_label_effect +from .snp_frq import AnophelesFrequencyAnalysis, AA_CHANGE_QUERY, _make_snp_label_effect from .hap_data import AnophelesHapData -class AnophelesHapClustAnalysis(AnophelesHapData, AnophelesSnpData): +class AnophelesHapClustAnalysis(AnophelesHapData, AnophelesSnpData, AnophelesFrequencyAnalysis): def __init__( self, **kwargs, From 094bc065bf9c1623b8ce22b5d326ba5fd43af068 Mon Sep 17 00:00:00 2001 From: Sanjay C Nagi Date: Fri, 19 Sep 2025 15:51:05 +0100 Subject: [PATCH 15/15] pre-commit lints --- malariagen_data/anoph/hapclust.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff 
--git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py index 571fd06b3..792b33509 100644 --- a/malariagen_data/anoph/hapclust.py +++ b/malariagen_data/anoph/hapclust.py @@ -21,7 +21,9 @@ from .hap_data import AnophelesHapData -class AnophelesHapClustAnalysis(AnophelesHapData, AnophelesSnpData, AnophelesFrequencyAnalysis): +class AnophelesHapClustAnalysis( + AnophelesHapData, AnophelesSnpData, AnophelesFrequencyAnalysis +): def __init__( self, **kwargs,