From 23105a43232220c70d37d1f2180b4c7c246ad8c0 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Fri, 6 Dec 2024 17:58:51 +0000 Subject: [PATCH 01/21] Replace contig with region in H12 GWSS functions and tests --- malariagen_data/anoph/h12.py | 50 ++++++++++++++++++------------------ notebooks/plot_h12_h1x.ipynb | 31 ++++++++++------------ tests/anoph/test_h12.py | 16 ++++++------ 3 files changed, 46 insertions(+), 51 deletions(-) diff --git a/malariagen_data/anoph/h12.py b/malariagen_data/anoph/h12.py index fc3df603f..7b749ccf7 100644 --- a/malariagen_data/anoph/h12.py +++ b/malariagen_data/anoph/h12.py @@ -25,7 +25,7 @@ def __init__( def _h12_calibration( self, - contig, + region, analysis, sample_query, sample_query_options, @@ -39,7 +39,7 @@ def _h12_calibration( inline_array, ) -> Mapping[str, np.ndarray]: ds_haps = self.haplotypes( - region=contig, + region=region, sample_sets=sample_sets, sample_query=sample_query, sample_query_options=sample_query_options, @@ -73,7 +73,7 @@ def _h12_calibration( ) def h12_calibration( self, - contig: base_params.contig, + region: base_params.region, analysis: hap_params.analysis = base_params.DEFAULT, sample_query: Optional[base_params.sample_query] = None, sample_query_options: Optional[base_params.sample_query_options] = None, @@ -95,7 +95,7 @@ def h12_calibration( name = "h12_calibration_v1" params = dict( - contig=contig, + region=region, analysis=self._prep_phasing_analysis_param(analysis=analysis), window_sizes=window_sizes, sample_sets=self._prep_sample_sets_param(sample_sets=sample_sets), @@ -131,7 +131,7 @@ def h12_calibration( ) def plot_h12_calibration( self, - contig: base_params.contig, + region: base_params.region, analysis: hap_params.analysis = base_params.DEFAULT, sample_query: Optional[base_params.sample_query] = None, sample_query_options: Optional[base_params.sample_query_options] = None, @@ -152,7 +152,7 @@ def plot_h12_calibration( ) -> gplt_params.optional_figure: # Get 
H12 values. calibration_runs = self.h12_calibration( - contig=contig, + region=region, analysis=analysis, sample_query=sample_query, sample_query_options=sample_query_options, @@ -227,7 +227,7 @@ def plot_h12_calibration( def _h12_gwss( self, - contig, + region, analysis, window_size, sample_sets, @@ -241,7 +241,7 @@ def _h12_gwss( inline_array, ): ds_haps = self.haplotypes( - region=contig, + region=region, analysis=analysis, sample_query=sample_query, sample_query_options=sample_query_options, @@ -289,7 +289,7 @@ def _h12_gwss( ) def h12_gwss( self, - contig: base_params.contig, + region: base_params.region, window_size: h12_params.window_size, analysis: hap_params.analysis = base_params.DEFAULT, sample_query: Optional[base_params.sample_query] = None, @@ -308,10 +308,10 @@ def h12_gwss( ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: # Change this name if you ever change the behaviour of this function, to # invalidate any previously cached data. - name = "h12_gwss_v2" + name = "h12_gwss_v3" params = dict( - contig=contig, + region=region, analysis=self._prep_phasing_analysis_param(analysis=analysis), window_size=window_size, sample_sets=self._prep_sample_sets_param(sample_sets=sample_sets), @@ -345,7 +345,7 @@ def h12_gwss( ) def plot_h12_gwss_track( self, - contig: base_params.contig, + region: base_params.region, window_size: h12_params.window_size, analysis: hap_params.analysis = base_params.DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, @@ -372,7 +372,7 @@ def plot_h12_gwss_track( ) -> gplt_params.optional_figure: # Compute H12. x, h12, contigs = self.h12_gwss( - contig=contig, + region=region, analysis=analysis, window_size=window_size, cohort_size=cohort_size, @@ -434,7 +434,7 @@ def plot_h12_gwss_track( # Tidy up the plot. 
fig.yaxis.axis_label = "H12" fig.yaxis.ticker = [0, 1] - self._bokeh_style_genome_xaxis(fig, contig) + self._bokeh_style_genome_xaxis(fig, region) if show: # pragma: no cover bokeh.plotting.show(fig) @@ -448,7 +448,7 @@ def plot_h12_gwss_track( ) def plot_h12_gwss( self, - contig: base_params.contig, + region: base_params.region, window_size: h12_params.window_size, analysis: hap_params.analysis = base_params.DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, @@ -477,7 +477,7 @@ def plot_h12_gwss( ) -> gplt_params.optional_figure: # Plot GWSS track. fig1 = self.plot_h12_gwss_track( - contig=contig, + region=region, analysis=analysis, window_size=window_size, sample_sets=sample_sets, @@ -502,7 +502,7 @@ def plot_h12_gwss( # Plot genes. fig2 = self.plot_genes( - region=contig, + region=region, sizing_mode=sizing_mode, width=width, height=genes_height, @@ -535,7 +535,7 @@ def plot_h12_gwss( ) def plot_h12_gwss_multi_overlay_track( self, - contig: base_params.contig, + region: base_params.region, cohorts: base_params.cohorts, window_size: h12_params.multi_window_size, cohort_size: Optional[base_params.cohort_size] = h12_params.cohort_size_default, @@ -578,7 +578,7 @@ def plot_h12_gwss_multi_overlay_track( res = {} for cohort_label, cohort_query in cohort_queries.items(): res[cohort_label] = self.h12_gwss( - contig=contig, + region=region, analysis=analysis, window_size=window_size[cohort_label], cohort_size=cohort_size, @@ -654,7 +654,7 @@ def plot_h12_gwss_multi_overlay_track( ) def plot_h12_gwss_multi_overlay( self, - contig: base_params.contig, + region: base_params.region, cohorts: base_params.cohorts, window_size: h12_params.multi_window_size, cohort_size: Optional[base_params.cohort_size] = h12_params.cohort_size_default, @@ -682,7 +682,7 @@ def plot_h12_gwss_multi_overlay( ) -> gplt_params.optional_figure: # Plot GWSS track. 
fig1 = self.plot_h12_gwss_multi_overlay_track( - contig=contig, + region=region, sample_query=sample_query, cohorts=cohorts, cohort_size=cohort_size, @@ -708,7 +708,7 @@ def plot_h12_gwss_multi_overlay( # Plot genes. fig2 = self.plot_genes( - region=contig, + region=region, sizing_mode=sizing_mode, width=width, height=genes_height, @@ -741,7 +741,7 @@ def plot_h12_gwss_multi_overlay( ) def plot_h12_gwss_multi_panel( self, - contig: base_params.contig, + region: base_params.region, cohorts: base_params.cohorts, window_size: h12_params.multi_window_size, cohort_size: Optional[base_params.cohort_size] = h12_params.cohort_size_default, @@ -784,7 +784,7 @@ def plot_h12_gwss_multi_panel( figs: list[gplt_params.figure] = [] for i, (cohort_label, cohort_query) in enumerate(cohort_queries.items()): params = dict( - contig=contig, + region=region, analysis=analysis, window_size=window_size[cohort_label], sample_sets=sample_sets, @@ -809,7 +809,7 @@ def plot_h12_gwss_multi_panel( # Plot genes. fig2 = self.plot_genes( - region=contig, + region=region, sizing_mode=sizing_mode, width=width, height=genes_height, diff --git a/notebooks/plot_h12_h1x.ipynb b/notebooks/plot_h12_h1x.ipynb index 33e00f53a..e7b17b2bf 100644 --- a/notebooks/plot_h12_h1x.ipynb +++ b/notebooks/plot_h12_h1x.ipynb @@ -89,7 +89,7 @@ "outputs": [], "source": [ "ag3.plot_h12_calibration(\n", - " contig=contig,\n", + " region=contig,\n", " analysis=\"gamb_colu\",\n", " sample_query=coh1_query,\n", " sample_sets=\"3.0\",\n", @@ -106,7 +106,7 @@ "outputs": [], "source": [ "ag3.plot_h12_gwss(\n", - " contig=contig,\n", + " region=contig,\n", " analysis=\"gamb_colu\",\n", " window_size=2000,\n", " sample_query=coh1_query,\n", @@ -123,7 +123,7 @@ "outputs": [], "source": [ "ag3.plot_h12_gwss(\n", - " contig=contigs,\n", + " region=contigs,\n", " analysis=\"gamb_colu\",\n", " window_size=2000,\n", " sample_query=coh1_query,\n", @@ -142,7 +142,7 @@ "outputs": [], "source": [ "ag3.plot_h12_gwss(\n", - " 
contig=contig,\n", + " region=contig,\n", " analysis=\"gamb_colu\",\n", " window_size=2000,\n", " sample_query=coh1_query,\n", @@ -162,7 +162,7 @@ "outputs": [], "source": [ "ag3.plot_h12_gwss(\n", - " contig=contig,\n", + " region=contig,\n", " analysis=\"gamb_colu\",\n", " window_size=2000,\n", " sample_query=coh2_query,\n", @@ -232,7 +232,7 @@ "outputs": [], "source": [ "ag3.plot_h12_gwss_multi_overlay(\n", - " contig=contig,\n", + " region=contig,\n", " window_size=2000,\n", " cohorts=\"admin2_year\",\n", " sample_sets=\"AG1000G-ML-A\",\n", @@ -251,7 +251,7 @@ "outputs": [], "source": [ "ag3.plot_h12_gwss_multi_panel(\n", - " contig=contig,\n", + " region=contig,\n", " window_size=2000,\n", " cohorts=\"admin2_year\",\n", " sample_sets=\"AG1000G-ML-A\",\n", @@ -322,7 +322,7 @@ "outputs": [], "source": [ "af1.plot_h12_gwss(\n", - " contig=contig,\n", + " region=contig,\n", " window_size=2000,\n", " sample_query=coh1_query,\n", " sample_sets=\"1.0\",\n", @@ -338,7 +338,7 @@ "outputs": [], "source": [ "af1.plot_h12_gwss(\n", - " contig=contig,\n", + " region=contig,\n", " window_size=2000,\n", " sample_query=coh2_query,\n", " sample_sets=\"1.0\",\n", @@ -375,7 +375,7 @@ "outputs": [], "source": [ "af1.plot_h12_gwss_multi_overlay(\n", - " contig=contig,\n", + " region=contig,\n", " window_size=window_size,\n", " cohorts=cohorts,\n", " sample_sets=\"1.0\",\n", @@ -393,7 +393,7 @@ "outputs": [], "source": [ "af1.plot_h12_gwss_multi_panel(\n", - " contig=contig,\n", + " region=contig,\n", " window_size=window_size,\n", " cohorts=cohorts,\n", " sample_sets=\"1.0\",\n", @@ -427,7 +427,7 @@ "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/workbench-notebooks:m125" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "mgen_data_py3.11", "language": "python", "name": "python3" }, @@ -441,12 +441,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" - }, - "vscode": { - 
"interpreter": { - "hash": "3b9ddb1005cd06989fd869b9e3d566470f1be01faa610bb17d64e58e32302e8b" - } + "version": "3.11.5" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/tests/anoph/test_h12.py b/tests/anoph/test_h12.py index 26ff07147..4e62def5c 100644 --- a/tests/anoph/test_h12.py +++ b/tests/anoph/test_h12.py @@ -105,7 +105,7 @@ def test_h12_calibration(fixture, api: AnophelesH12Analysis): window_sizes = np.random.randint(100, 500, size=random.randint(2, 5)).tolist() window_sizes = sorted(set([int(x) for x in window_sizes])) h12_params = dict( - contig=random.choice(api.contigs), + region=random.choice(api.contigs), sample_sets=[random.choice(all_sample_sets)], window_sizes=window_sizes, min_cohort_size=5, @@ -168,7 +168,7 @@ def test_h12_gwss_with_default_analysis(fixture, api: AnophelesH12Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() h12_params = dict( - contig=random.choice(api.contigs), + region=random.choice(api.contigs), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), min_cohort_size=5, @@ -197,7 +197,7 @@ def test_h12_gwss_with_analysis(fixture, api: AnophelesH12Analysis): # No samples available, check similar error raised from H12. with pytest.raises(ValueError): api.h12_gwss( - contig=contig, + region=contig, sample_sets=sample_sets, analysis=analysis, window_size=window_size, @@ -208,7 +208,7 @@ def test_h12_gwss_with_analysis(fixture, api: AnophelesH12Analysis): # Samples are available, run full checks. n_samples = ds_hap.sizes["samples"] h12_params = dict( - contig=contig, + region=contig, sample_sets=sample_sets, analysis=analysis, window_size=window_size, @@ -219,7 +219,7 @@ def test_h12_gwss_with_analysis(fixture, api: AnophelesH12Analysis): # Check min_cohort_size behaviour. 
with pytest.raises(ValueError): api.h12_gwss( - contig=contig, + region=contig, sample_sets=sample_sets, analysis=analysis, window_size=window_size, @@ -236,7 +236,7 @@ def test_h12_gwss_multi_with_default_analysis(fixture, api: AnophelesH12Analysis cohort1_query = f"country == '{country1}'" cohort2_query = f"country == '{country2}'" h12_params = dict( - contig=random.choice(api.contigs), + region=random.choice(api.contigs), sample_sets=all_sample_sets, window_size=random.randint(100, 500), min_cohort_size=1, @@ -256,7 +256,7 @@ def test_h12_gwss_multi_with_window_size_dict(fixture, api: AnophelesH12Analysis cohort1_query = f"country == '{country1}'" cohort2_query = f"country == '{country2}'" h12_params = dict( - contig=random.choice(api.contigs), + region=random.choice(api.contigs), sample_sets=all_sample_sets, window_size={ "cohort1": random.randint(100, 500), @@ -309,7 +309,7 @@ def test_h12_gwss_multi_with_analysis(fixture, api: AnophelesH12Analysis): # Samples are available, run full checks. 
h12_params = dict( analysis=analysis, - contig=contig, + region=contig, sample_sets=all_sample_sets, window_size=random.randint(100, 500), min_cohort_size=min(n1, n2), From acdba78653dc631c3fd2db41cfa3a371b1242069 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Fri, 6 Dec 2024 18:21:28 +0000 Subject: [PATCH 02/21] Replace contig with region in G123 GWSS functions and tests --- malariagen_data/anoph/g123.py | 40 +++++++++++++++++----------------- notebooks/plot_g123_gwss.ipynb | 22 +++++++++---------- tests/anoph/test_g123.py | 12 +++++----- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/malariagen_data/anoph/g123.py b/malariagen_data/anoph/g123.py index dca996aa0..e08d62624 100644 --- a/malariagen_data/anoph/g123.py +++ b/malariagen_data/anoph/g123.py @@ -37,7 +37,7 @@ def __init__( def _load_data_for_g123( self, *, - contig, + region, sites, site_mask, sample_sets, @@ -50,7 +50,7 @@ def _load_data_for_g123( chunks, ): ds_snps = self.snp_calls( - region=contig, + region=region, sample_query=sample_query, sample_query_options=sample_query_options, sample_sets=sample_sets, @@ -74,7 +74,7 @@ def _load_data_for_g123( # of samples was used to set up the phasing analysis. 
with self._spinner("Subsetting to selected sites"): haplotype_pos = self.haplotype_sites( - region=contig, + region=region, analysis=sites, field="POS", inline_array=True, @@ -106,7 +106,7 @@ def _load_data_for_g123( def _g123_gwss( self, *, - contig, + region, sites, site_mask, window_size, @@ -120,7 +120,7 @@ def _g123_gwss( chunks, ): gt, pos = self._load_data_for_g123( - contig=contig, + region=region, sites=sites, site_mask=site_mask, sample_sets=sample_sets, @@ -151,7 +151,7 @@ def _g123_gwss( ) def g123_gwss( self, - contig: base_params.contig, + region: base_params.region, window_size: g123_params.window_size, sites: g123_params.sites = base_params.DEFAULT, site_mask: Optional[base_params.site_mask] = base_params.DEFAULT, @@ -170,7 +170,7 @@ def g123_gwss( ) -> Tuple[np.ndarray, np.ndarray]: # Change this name if you ever change the behaviour of this function, to # invalidate any previously cached data. - name = "g123_gwss_v1" + name = "g123_gwss_v2" if sites == base_params.DEFAULT: assert self._default_phasing_analysis is not None @@ -182,7 +182,7 @@ def g123_gwss( ) params = dict( - contig=contig, + region=region, sites=sites, site_mask=site_mask, window_size=window_size, @@ -214,7 +214,7 @@ def g123_gwss( def _g123_calibration( self, *, - contig, + region, sites, site_mask, sample_query, @@ -228,7 +228,7 @@ def _g123_calibration( chunks, ) -> Mapping[str, np.ndarray]: gt, _ = self._load_data_for_g123( - contig=contig, + region=region, sites=sites, site_mask=site_mask, sample_query=sample_query, @@ -258,7 +258,7 @@ def _g123_calibration( ) def g123_calibration( self, - contig: base_params.contig, + region: base_params.region, sites: g123_params.sites = base_params.DEFAULT, site_mask: Optional[base_params.site_mask] = base_params.DEFAULT, sample_query: Optional[base_params.sample_query] = None, @@ -280,7 +280,7 @@ def g123_calibration( name = "g123_calibration_v1" params = dict( - contig=contig, + region=region, sites=sites, 
site_mask=self._prep_optional_site_mask_param(site_mask=site_mask), window_sizes=window_sizes, @@ -312,7 +312,7 @@ def g123_calibration( ) def plot_g123_gwss_track( self, - contig: base_params.contig, + region: base_params.region, window_size: g123_params.window_size, sites: g123_params.sites = base_params.DEFAULT, site_mask: Optional[base_params.site_mask] = base_params.DEFAULT, @@ -338,7 +338,7 @@ def plot_g123_gwss_track( ) -> gplt_params.optional_figure: # compute G123 x, g123 = self.g123_gwss( - contig=contig, + region=region, sites=sites, site_mask=site_mask, window_size=window_size, @@ -401,7 +401,7 @@ def plot_g123_gwss_track( # tidy up the plot fig.yaxis.axis_label = "G123" fig.yaxis.ticker = [0, 1] - self._bokeh_style_genome_xaxis(fig, contig) + self._bokeh_style_genome_xaxis(fig, region) if show: # pragma: no cover bokeh.plotting.show(fig) @@ -415,7 +415,7 @@ def plot_g123_gwss_track( ) def plot_g123_gwss( self, - contig: base_params.contig, + region: base_params.region, window_size: g123_params.window_size, sites: g123_params.sites = base_params.DEFAULT, site_mask: Optional[base_params.site_mask] = base_params.DEFAULT, @@ -443,7 +443,7 @@ def plot_g123_gwss( ) -> gplt_params.optional_figure: # gwss track fig1 = self.plot_g123_gwss_track( - contig=contig, + region=region, sites=sites, site_mask=site_mask, window_size=window_size, @@ -467,7 +467,7 @@ def plot_g123_gwss( # plot genes fig2 = self.plot_genes( - region=contig, + region=region, sizing_mode=sizing_mode, width=width, height=genes_height, @@ -500,7 +500,7 @@ def plot_g123_gwss( ) def plot_g123_calibration( self, - contig: base_params.contig, + region: base_params.region, sites: g123_params.sites, site_mask: Optional[base_params.site_mask] = base_params.DEFAULT, sample_query: Optional[base_params.sample_query] = None, @@ -521,7 +521,7 @@ def plot_g123_calibration( ) -> gplt_params.optional_figure: # get g123 values calibration_runs = self.g123_calibration( - contig=contig, + region=region, 
sites=sites, site_mask=site_mask, sample_query=sample_query, diff --git a/notebooks/plot_g123_gwss.ipynb b/notebooks/plot_g123_gwss.ipynb index 73ac584af..396b241dc 100644 --- a/notebooks/plot_g123_gwss.ipynb +++ b/notebooks/plot_g123_gwss.ipynb @@ -68,7 +68,7 @@ "source": [ "%%time\n", "ag3.plot_g123_calibration(\n", - " contig=contig,\n", + " region=contig,\n", " sites=site_mask,\n", " sample_sets=sample_set,\n", " sample_query=sample_query,\n", @@ -85,7 +85,7 @@ "source": [ "%%time\n", "ag3.plot_g123_calibration(\n", - " contig=contig,\n", + " region=contig,\n", " sites=site_mask,\n", " sample_sets=sample_set,\n", " sample_query=sample_query,\n", @@ -102,7 +102,7 @@ "source": [ "%%time\n", "ag3.plot_g123_gwss_track(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=1_000,\n", " site_mask=site_mask,\n", " sites=site_mask,\n", @@ -121,7 +121,7 @@ "source": [ "%%time\n", "ag3.plot_g123_gwss_track(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=1_000,\n", " site_mask=site_mask,\n", " sites=site_mask,\n", @@ -140,7 +140,7 @@ "source": [ "%%time\n", "ag3.plot_g123_gwss_track(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=1_000,\n", " site_mask=site_mask,\n", " sites=\"segregating\",\n", @@ -161,7 +161,7 @@ "source": [ "%%time\n", "ag3.plot_g123_gwss(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=1_000,\n", " site_mask=site_mask,\n", " sites=\"all\",\n", @@ -181,7 +181,7 @@ "source": [ "%%time\n", "ag3.plot_g123_gwss(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=1_000,\n", " site_mask=site_mask,\n", " sites=site_mask,\n", @@ -219,7 +219,7 @@ "outputs": [], "source": [ "af1.plot_g123_gwss(\n", - " contig=\"X\",\n", + " region=\"X\",\n", " window_size=2_000,\n", " sites=\"segregating\",\n", " site_mask=\"funestus\",\n", @@ -237,7 +237,7 @@ "outputs": [], "source": [ "af1.plot_g123_gwss(\n", - " contig=\"X\",\n", + " region=\"X\",\n", " window_size=2000,\n", " site_mask=\"funestus\",\n", 
" sites=\"funestus\",\n", @@ -258,7 +258,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "mgen_data_py3.11", "language": "python", "name": "python3" }, @@ -272,7 +272,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.5" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/tests/anoph/test_g123.py b/tests/anoph/test_g123.py index 59b5936ca..4bd5cb4ab 100644 --- a/tests/anoph/test_g123.py +++ b/tests/anoph/test_g123.py @@ -101,7 +101,7 @@ def test_g123_gwss_with_default_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - contig=random.choice(api.contigs), + region=random.choice(api.contigs), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), min_cohort_size=10, @@ -116,7 +116,7 @@ def test_g123_gwss_with_phased_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - contig=random.choice(api.contigs), + region=random.choice(api.contigs), sites=random.choice(api.phasing_analysis_ids), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), @@ -132,7 +132,7 @@ def test_g123_gwss_with_segregating_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - contig=random.choice(api.contigs), + region=random.choice(api.contigs), sites="segregating", site_mask=random.choice(api.site_mask_ids), sample_sets=[random.choice(all_sample_sets)], @@ -149,7 +149,7 @@ def test_g123_gwss_with_all_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. 
all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - contig=random.choice(api.contigs), + region=random.choice(api.contigs), sites="all", site_mask=None, sample_sets=[random.choice(all_sample_sets)], @@ -166,7 +166,7 @@ def test_g123_gwss_with_bad_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - contig=random.choice(api.contigs), + region=random.choice(api.contigs), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), min_cohort_size=10, @@ -185,7 +185,7 @@ def test_g123_calibration(fixture, api: AnophelesG123Analysis): window_sizes = np.random.randint(100, 500, size=random.randint(2, 5)).tolist() window_sizes = sorted([int(x) for x in window_sizes]) g123_params = dict( - contig=random.choice(api.contigs), + region=random.choice(api.contigs), sites=random.choice(api.phasing_analysis_ids), sample_sets=[random.choice(all_sample_sets)], min_cohort_size=10, From 226c471ed989716a0b0fd78d809a59234ca8132b Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Fri, 6 Dec 2024 20:19:01 +0000 Subject: [PATCH 03/21] Replace contig with region in iHS GWSS functions and tests --- malariagen_data/anopheles.py | 20 ++++++++++---------- notebooks/plot_ihs_gwss.ipynb | 16 ++++++++-------- tests/test_ag3.py | 4 ++-- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index d932b39ca..9eb26e761 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -1279,7 +1279,7 @@ def plot_diversity_stats( ) def ihs_gwss( self, - contig: base_params.contig, + region: base_params.region, analysis: hap_params.analysis = base_params.DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, sample_query: Optional[base_params.sample_query] = None, @@ -1312,7 +1312,7 @@ def ihs_gwss( name = self._ihs_gwss_cache_name 
params = dict( - contig=contig, + region=region, analysis=self._prep_phasing_analysis_param(analysis=analysis), window_size=window_size, percentiles=percentiles, @@ -1353,7 +1353,7 @@ def ihs_gwss( def _ihs_gwss( self, *, - contig, + region, analysis, sample_sets, sample_query, @@ -1378,7 +1378,7 @@ def _ihs_gwss( inline_array, ): ds_haps = self.haplotypes( - region=contig, + region=region, analysis=analysis, sample_query=sample_query, sample_query_options=sample_query_options, @@ -1452,7 +1452,7 @@ def _ihs_gwss( ) def plot_ihs_gwss_track( self, - contig: base_params.contig, + region: base_params.region, analysis: hap_params.analysis = base_params.DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, sample_query: Optional[base_params.sample_query] = None, @@ -1490,7 +1490,7 @@ def plot_ihs_gwss_track( ) -> gplt_params.optional_figure: # compute ihs x, ihs = self.ihs_gwss( - contig=contig, + region=region, analysis=analysis, window_size=window_size, percentiles=percentiles, @@ -1584,7 +1584,7 @@ def plot_ihs_gwss_track( # tidy up the plot fig.yaxis.axis_label = "ihs" - self._bokeh_style_genome_xaxis(fig, contig) + self._bokeh_style_genome_xaxis(fig, region) if show: # pragma: no cover bokeh.plotting.show(fig) @@ -1700,7 +1700,7 @@ def plot_xpehh_gwss( ) def plot_ihs_gwss( self, - contig: base_params.contig, + region: base_params.region, analysis: hap_params.analysis = base_params.DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, sample_query: Optional[base_params.sample_query] = None, @@ -1740,7 +1740,7 @@ def plot_ihs_gwss( ) -> gplt_params.optional_figure: # gwss track fig1 = self.plot_ihs_gwss_track( - contig=contig, + region=region, analysis=analysis, sample_sets=sample_sets, sample_query=sample_query, @@ -1776,7 +1776,7 @@ def plot_ihs_gwss( # plot genes fig2 = self.plot_genes( - region=contig, + region=region, sizing_mode=sizing_mode, width=width, height=genes_height, diff --git a/notebooks/plot_ihs_gwss.ipynb 
b/notebooks/plot_ihs_gwss.ipynb index 4a8fa53c7..d08ed48dc 100644 --- a/notebooks/plot_ihs_gwss.ipynb +++ b/notebooks/plot_ihs_gwss.ipynb @@ -42,7 +42,7 @@ "outputs": [], "source": [ "ag3.plot_ihs_gwss_track(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=1_000,\n", " analysis=\"gamb_colu\",\n", " percentiles=(50, 60, 90, 100),\n", @@ -60,7 +60,7 @@ "outputs": [], "source": [ "ag3.plot_ihs_gwss_track(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=1000,\n", " analysis=\"gamb_colu\",\n", " sample_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", @@ -81,7 +81,7 @@ "outputs": [], "source": [ "ag3.plot_ihs_gwss(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=100,\n", " sample_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", " sample_sets=\"3.0\",\n", @@ -99,7 +99,7 @@ "outputs": [], "source": [ "ag3.plot_ihs_gwss(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=1_000,\n", " sample_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", " sample_sets=\"3.0\",\n", @@ -127,7 +127,7 @@ "outputs": [], "source": [ "af1.plot_ihs_gwss(\n", - " contig=\"X\",\n", + " region=\"X\",\n", " window_size=2_000,\n", " sample_query=\"cohort_admin1_year == 'KE-03_fune_2016'\",\n", " sample_sets=\"1.0\",\n", @@ -143,7 +143,7 @@ "outputs": [], "source": [ "af1.plot_ihs_gwss(\n", - " contig=\"X\",\n", + " region=\"X\",\n", " window_size=200,\n", " sample_query=\"cohort_admin1_year == 'KE-03_fune_2016'\",\n", " sample_sets=\"1.0\",\n", @@ -162,7 +162,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "mgen_data_py3.11", "language": "python", "name": "python3" }, @@ -176,7 +176,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.5" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/tests/test_ag3.py b/tests/test_ag3.py index 8536a56f7..40bf467b2 100644 --- 
a/tests/test_ag3.py +++ b/tests/test_ag3.py @@ -102,13 +102,13 @@ def test_locate_region(region_raw): def test_ihs_gwss(): ag3 = setup_ag3(cohorts_analysis="20230516") sample_query = "country == 'Ghana'" - contig = "3L" + region = "3L" analysis = "gamb_colu" sample_sets = "3.0" window_size = 1000 x, ihs = ag3.ihs_gwss( - contig=contig, + region=region, analysis=analysis, sample_query=sample_query, sample_sets=sample_sets, From 1409756f9e813b28ce3844ccf5189619fe1b62a2 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Fri, 6 Dec 2024 20:22:41 +0000 Subject: [PATCH 04/21] Change iHS GWSS cache names --- malariagen_data/af1.py | 2 +- malariagen_data/ag3.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py index 546e45fb0..a79b427b1 100644 --- a/malariagen_data/af1.py +++ b/malariagen_data/af1.py @@ -13,7 +13,7 @@ "us-central1": "gs://vo_afun_release_master_us_central1", } XPEHH_GWSS_CACHE_NAME = "af1_xpehh_gwss_v1" -IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1" +IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v2" TAXON_PALETTE = px.colors.qualitative.Plotly TAXON_COLORS = { diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py index 316008a79..422726e06 100644 --- a/malariagen_data/ag3.py +++ b/malariagen_data/ag3.py @@ -26,7 +26,7 @@ "us-central1": "gs://vo_agam_release_master_us_central1", } XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1" -IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1" +IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v2" VIRTUAL_CONTIGS = { "2RL": ("2R", "2L"), "3RL": ("3R", "3L"), From 65c048f2b00ca62b24537e12e7d6550c01b21d29 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:14:47 +0000 Subject: [PATCH 05/21] Replace contig with region in FST GWSS functions and tests --- malariagen_data/anoph/fst.py | 24 ++++++++++++------------ notebooks/plot_fst_gwss.ipynb | 16 ++++++++-------- tests/anoph/test_fst.py | 2 +- 3 files changed, 21 
insertions(+), 21 deletions(-) diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py index 086d81824..858e35bc4 100644 --- a/malariagen_data/anoph/fst.py +++ b/malariagen_data/anoph/fst.py @@ -29,7 +29,7 @@ def __init__( def _fst_gwss( self, *, - contig, + region, window_size, sample_sets, cohort1_query, @@ -46,7 +46,7 @@ def _fst_gwss( ): # Compute allele counts. ac1 = self.snp_allele_counts( - region=contig, + region=region, sample_query=cohort1_query, sample_query_options=sample_query_options, sample_sets=sample_sets, @@ -59,7 +59,7 @@ def _fst_gwss( chunks=chunks, ) ac2 = self.snp_allele_counts( - region=contig, + region=region, sample_query=cohort2_query, sample_query_options=sample_query_options, sample_sets=sample_sets, @@ -74,7 +74,7 @@ def _fst_gwss( with self._spinner(desc="Load SNP positions"): pos = self.snp_sites( - region=contig, + region=region, field="POS", site_mask=site_mask, inline_array=inline_array, @@ -105,7 +105,7 @@ def _fst_gwss( ) def fst_gwss( self, - contig: base_params.contig, + region: base_params.region, window_size: fst_params.window_size, cohort1_query: base_params.sample_query, cohort2_query: base_params.sample_query, @@ -129,7 +129,7 @@ def fst_gwss( name = "fst_gwss_v2" params = dict( - contig=contig, + region=region, window_size=window_size, cohort1_query=cohort1_query, cohort2_query=cohort2_query, @@ -164,7 +164,7 @@ def fst_gwss( ) def plot_fst_gwss_track( self, - contig: base_params.contig, + region: base_params.region, window_size: fst_params.window_size, cohort1_query: base_params.sample_query, cohort2_query: base_params.sample_query, @@ -190,7 +190,7 @@ def plot_fst_gwss_track( ) -> gplt_params.optional_figure: # compute Fst x, fst = self.fst_gwss( - contig=contig, + region=region, window_size=window_size, cohort_size=cohort_size, min_cohort_size=min_cohort_size, @@ -253,7 +253,7 @@ def plot_fst_gwss_track( # tidy up the plot fig.yaxis.axis_label = "Fst" fig.yaxis.ticker = [0, 1] - 
self._bokeh_style_genome_xaxis(fig, contig) + self._bokeh_style_genome_xaxis(fig, region) if show: # pragma: no cover bokeh.plotting.show(fig) @@ -270,7 +270,7 @@ def plot_fst_gwss_track( ) def plot_fst_gwss( self, - contig: base_params.contig, + region: base_params.region, window_size: fst_params.window_size, cohort1_query: base_params.sample_query, cohort2_query: base_params.sample_query, @@ -298,7 +298,7 @@ def plot_fst_gwss( ) -> gplt_params.optional_figure: # gwss track fig1 = self.plot_fst_gwss_track( - contig=contig, + region=region, window_size=window_size, cohort1_query=cohort1_query, cohort2_query=cohort2_query, @@ -322,7 +322,7 @@ def plot_fst_gwss( # plot genes fig2 = self.plot_genes( - region=contig, + region=region, sizing_mode=sizing_mode, width=width, height=genes_height, diff --git a/notebooks/plot_fst_gwss.ipynb b/notebooks/plot_fst_gwss.ipynb index a627356d6..462746296 100644 --- a/notebooks/plot_fst_gwss.ipynb +++ b/notebooks/plot_fst_gwss.ipynb @@ -54,7 +54,7 @@ "outputs": [], "source": [ "ag3.plot_fst_gwss_track(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=10_000,\n", " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", " cohort2_query=\"cohort_admin2_year == 'ML-2_Kati_gamb_2014'\",\n", @@ -72,7 +72,7 @@ "outputs": [], "source": [ "ag3.plot_fst_gwss_track(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=10_000,\n", " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", " cohort2_query=\"cohort_admin2_year == 'ML-2_Kati_gamb_2014'\",\n", @@ -92,7 +92,7 @@ "outputs": [], "source": [ "ag3.plot_fst_gwss(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=10_000,\n", " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", " cohort2_query=\"cohort_admin2_year == 'ML-2_Kati_gamb_2014'\",\n", @@ -110,7 +110,7 @@ "outputs": [], "source": [ "ag3.plot_fst_gwss(\n", - " contig=\"2L\",\n", + " region=\"2L\",\n", " window_size=10_000,\n", " 
cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", " cohort2_query=\"country == 'Mayotte' and taxon == 'gambiae'\",\n", @@ -128,7 +128,7 @@ "outputs": [], "source": [ "af1.plot_fst_gwss(\n", - " contig=\"X\",\n", + " region=\"X\",\n", " window_size=20_000,\n", " cohort1_query=\"cohort_admin1_year == 'KE-03_fune_2016'\",\n", " cohort2_query=\"cohort_admin1_year == 'MZ-L_fune_2016'\",\n", @@ -145,7 +145,7 @@ "outputs": [], "source": [ "af1.plot_fst_gwss(\n", - " contig=\"X\",\n", + " region=\"X\",\n", " window_size=20_000,\n", " cohort1_query=\"cohort_admin1_year == 'KE-03_fune_2016'\",\n", " cohort2_query=\"cohort_admin1_year == 'MZ-L_fune_2016'\",\n", @@ -173,7 +173,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "mgen_data_py3.11", "language": "python", "name": "python3" }, @@ -187,7 +187,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.5" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/tests/anoph/test_fst.py b/tests/anoph/test_fst.py index 42f59e103..f205a838b 100644 --- a/tests/anoph/test_fst.py +++ b/tests/anoph/test_fst.py @@ -84,7 +84,7 @@ def test_fst_gwss(fixture, api: AnophelesFstAnalysis): cohort1_query = f"country == {countries[0]!r}" cohort2_query = f"country == {countries[1]!r}" fst_params = dict( - contig=random.choice(api.contigs), + region=random.choice(api.contigs), sample_sets=all_sample_sets, cohort1_query=cohort1_query, cohort2_query=cohort2_query, From bf5b2f9a5393e241794c6ba3726fabaac5ed2075 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:33:16 +0000 Subject: [PATCH 06/21] Change cache name for fst_gwss() --- malariagen_data/anoph/fst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py index 858e35bc4..11297ddf0 100644 --- 
a/malariagen_data/anoph/fst.py +++ b/malariagen_data/anoph/fst.py @@ -126,7 +126,7 @@ def fst_gwss( ) -> Tuple[np.ndarray, np.ndarray]: # Change this name if you ever change the behaviour of this function, to # invalidate any previously cached data. - name = "fst_gwss_v2" + name = "fst_gwss_v3" params = dict( region=region, From ab45382465814f349c5330ac6cadea38d80a658a Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:02:57 +0000 Subject: [PATCH 07/21] Replace contig with region in H1X GWSS functions and tests. Change cache name. --- malariagen_data/anoph/h1x.py | 24 ++++++++++++------------ notebooks/plot_h12_h1x.ipynb | 6 +++--- tests/anoph/test_h1x.py | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/malariagen_data/anoph/h1x.py b/malariagen_data/anoph/h1x.py index 79e5b1ab7..1f0927642 100644 --- a/malariagen_data/anoph/h1x.py +++ b/malariagen_data/anoph/h1x.py @@ -26,7 +26,7 @@ def __init__( def _h1x_gwss( self, *, - contig, + region, analysis, window_size, sample_sets, @@ -42,7 +42,7 @@ def _h1x_gwss( ): # Access haplotype datasets for each cohort. ds1 = self.haplotypes( - region=contig, + region=region, analysis=analysis, sample_query=cohort1_query, sample_query_options=sample_query_options, @@ -55,7 +55,7 @@ def _h1x_gwss( inline_array=inline_array, ) ds2 = self.haplotypes( - region=contig, + region=region, analysis=analysis, sample_query=cohort2_query, sample_query_options=sample_query_options, @@ -110,7 +110,7 @@ def _h1x_gwss( ) def h1x_gwss( self, - contig: base_params.contig, + region: base_params.region, window_size: h12_params.window_size, cohort1_query: base_params.sample_query, cohort2_query: base_params.sample_query, @@ -130,10 +130,10 @@ def h1x_gwss( ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: # Change this name if you ever change the behaviour of this function, to # invalidate any previously cached data. 
- name = "h1x_gwss_v2" + name = "h1x_gwss_v3" params = dict( - contig=contig, + region=region, analysis=self._prep_phasing_analysis_param(analysis=analysis), window_size=window_size, # N.B., do not be tempted to convert these sample queries into integer @@ -171,7 +171,7 @@ def h1x_gwss( ) def plot_h1x_gwss_track( self, - contig: base_params.contig, + region: base_params.region, window_size: h12_params.window_size, cohort1_query: base_params.cohort1_query, cohort2_query: base_params.cohort2_query, @@ -199,7 +199,7 @@ def plot_h1x_gwss_track( ) -> gplt_params.optional_figure: # Compute H1X. x, h1x, contigs = self.h1x_gwss( - contig=contig, + region=region, analysis=analysis, window_size=window_size, cohort_size=cohort_size, @@ -262,7 +262,7 @@ def plot_h1x_gwss_track( # Tidy up the plot. fig.yaxis.axis_label = "H1X" fig.yaxis.ticker = [0, 1] - self._bokeh_style_genome_xaxis(fig, contig) + self._bokeh_style_genome_xaxis(fig, region) if show: # pragma: no cover bokeh.plotting.show(fig) @@ -279,7 +279,7 @@ def plot_h1x_gwss_track( ) def plot_h1x_gwss( self, - contig: base_params.contig, + region: base_params.region, window_size: h12_params.window_size, cohort1_query: base_params.cohort1_query, cohort2_query: base_params.cohort2_query, @@ -309,7 +309,7 @@ def plot_h1x_gwss( ) -> gplt_params.optional_figure: # Plot GWSS track. fig1 = self.plot_h1x_gwss_track( - contig=contig, + region=region, analysis=analysis, window_size=window_size, cohort1_query=cohort1_query, @@ -335,7 +335,7 @@ def plot_h1x_gwss( # Plot genes. 
fig2 = self.plot_genes( - region=contig, + region=region, sizing_mode=sizing_mode, width=width, height=genes_height, diff --git a/notebooks/plot_h12_h1x.ipynb b/notebooks/plot_h12_h1x.ipynb index e7b17b2bf..7adf51f32 100644 --- a/notebooks/plot_h12_h1x.ipynb +++ b/notebooks/plot_h12_h1x.ipynb @@ -181,7 +181,7 @@ "outputs": [], "source": [ "ag3.plot_h1x_gwss(\n", - " contig=contig,\n", + " region=contig,\n", " window_size=2000,\n", " cohort1_query=coh1_query,\n", " cohort2_query=coh2_query,\n", @@ -199,7 +199,7 @@ "outputs": [], "source": [ "ag3.plot_h1x_gwss(\n", - " contig=contigs,\n", + " region=contigs,\n", " window_size=2000,\n", " cohort1_query=coh1_query,\n", " cohort2_query=coh2_query,\n", @@ -409,7 +409,7 @@ "outputs": [], "source": [ "af1.plot_h1x_gwss(\n", - " contig=contig,\n", + " region=contig,\n", " window_size=2000,\n", " cohort1_query=coh1_query,\n", " cohort2_query=coh2_query,\n", diff --git a/tests/anoph/test_h1x.py b/tests/anoph/test_h1x.py index b7b5362d6..2a21fc855 100644 --- a/tests/anoph/test_h1x.py +++ b/tests/anoph/test_h1x.py @@ -143,7 +143,7 @@ def test_h1x_gwss_with_default_analysis(fixture, api: AnophelesH1XAnalysis): cohort1_query = f"country == '{country1}'" cohort2_query = f"country == '{country2}'" h1x_params = dict( - contig=random.choice(api.contigs), + region=random.choice(api.contigs), sample_sets=all_sample_sets, window_size=random.randint(100, 500), min_cohort_size=1, @@ -194,7 +194,7 @@ def test_h1x_gwss_with_analysis(fixture, api: AnophelesH1XAnalysis): # Samples are available, run full checks. h1x_params = dict( analysis=analysis, - contig=contig, + region=contig, sample_sets=all_sample_sets, window_size=random.randint(100, 500), min_cohort_size=min(n1, n2), From 95fb659eae9ae07f378d36802f5bf376a4260750 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:19:24 +0000 Subject: [PATCH 08/21] Replace contig with region in XP-EHH GWSS functions and tests. Change cache names. 
--- malariagen_data/af1.py | 2 +- malariagen_data/ag3.py | 2 +- malariagen_data/anopheles.py | 22 +++++++++++----------- notebooks/plot_xpehh_gwss.ipynb | 18 +++++++++--------- tests/integration/test_ag3.py | 2 +- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py index a79b427b1..f1d09ec1d 100644 --- a/malariagen_data/af1.py +++ b/malariagen_data/af1.py @@ -12,7 +12,7 @@ GCS_REGION_URLS = { "us-central1": "gs://vo_afun_release_master_us_central1", } -XPEHH_GWSS_CACHE_NAME = "af1_xpehh_gwss_v1" +XPEHH_GWSS_CACHE_NAME = "af1_xpehh_gwss_v2" IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v2" TAXON_PALETTE = px.colors.qualitative.Plotly diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py index 422726e06..09e31633d 100644 --- a/malariagen_data/ag3.py +++ b/malariagen_data/ag3.py @@ -25,7 +25,7 @@ GCS_REGION_URLS = { "us-central1": "gs://vo_agam_release_master_us_central1", } -XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1" +XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v2" IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v2" VIRTUAL_CONTIGS = { "2RL": ("2R", "2L"), diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 9eb26e761..15db88b9f 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -1598,7 +1598,7 @@ def plot_ihs_gwss_track( ) def plot_xpehh_gwss( self, - contig: base_params.contig, + region: base_params.region, analysis: hap_params.analysis = base_params.DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, cohort1_query: Optional[base_params.sample_query] = None, @@ -1635,7 +1635,7 @@ def plot_xpehh_gwss( ) -> gplt_params.optional_figure: # gwss track fig1 = self.plot_xpehh_gwss_track( - contig=contig, + region=region, analysis=analysis, sample_sets=sample_sets, cohort1_query=cohort1_query, @@ -1669,7 +1669,7 @@ def plot_xpehh_gwss( # plot genes fig2 = self.plot_genes( - region=contig, + region=region, sizing_mode=sizing_mode, width=width, height=genes_height, @@ 
-1812,7 +1812,7 @@ def plot_ihs_gwss( ) def xpehh_gwss( self, - contig: base_params.contig, + region: base_params.region, analysis: hap_params.analysis = base_params.DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, cohort1_query: Optional[base_params.sample_query] = None, @@ -1842,7 +1842,7 @@ def xpehh_gwss( name = self._xpehh_gwss_cache_name params = dict( - contig=contig, + region=region, analysis=self._prep_phasing_analysis_param(analysis=analysis), window_size=window_size, percentiles=percentiles, @@ -1882,7 +1882,7 @@ def xpehh_gwss( def _xpehh_gwss( self, *, - contig, + region, analysis, sample_sets, cohort1_query, @@ -1904,7 +1904,7 @@ def _xpehh_gwss( inline_array, ): ds_haps1 = self.haplotypes( - region=contig, + region=region, analysis=analysis, sample_query=cohort1_query, sample_query_options=sample_query_options, @@ -1917,7 +1917,7 @@ def _xpehh_gwss( ) ds_haps2 = self.haplotypes( - region=contig, + region=region, analysis=analysis, sample_query=cohort2_query, sample_query_options=sample_query_options, @@ -1988,7 +1988,7 @@ def _xpehh_gwss( ) def plot_xpehh_gwss_track( self, - contig: base_params.contig, + region: base_params.region, analysis: hap_params.analysis = base_params.DEFAULT, sample_sets: Optional[base_params.sample_sets] = None, cohort1_query: Optional[base_params.sample_query] = None, @@ -2023,7 +2023,7 @@ def plot_xpehh_gwss_track( ) -> gplt_params.optional_figure: # compute xpehh x, xpehh = self.xpehh_gwss( - contig=contig, + region=region, analysis=analysis, window_size=window_size, percentiles=percentiles, @@ -2116,7 +2116,7 @@ def plot_xpehh_gwss_track( # tidy up the plot fig.yaxis.axis_label = "XP-EHH" - self._bokeh_style_genome_xaxis(fig, contig) + self._bokeh_style_genome_xaxis(fig, region) if show: # pragma: no cover bokeh.plotting.show(fig) diff --git a/notebooks/plot_xpehh_gwss.ipynb b/notebooks/plot_xpehh_gwss.ipynb index cd9e67583..2155ea075 100644 --- a/notebooks/plot_xpehh_gwss.ipynb +++ 
b/notebooks/plot_xpehh_gwss.ipynb @@ -40,7 +40,7 @@ "outputs": [], "source": [ "ag3.plot_xpehh_gwss(\n", - " contig=\"X\",\n", + " region=\"X\",\n", " window_size=1_000,\n", " analysis=\"gamb_colu\",\n", " percentiles=(50, 60, 90, 100),\n", @@ -59,7 +59,7 @@ "outputs": [], "source": [ "ag3.plot_xpehh_gwss_track(\n", - " contig=\"X\",\n", + " region=\"X\",\n", " window_size=1_000,\n", " analysis=\"gamb_colu\",\n", " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", @@ -80,7 +80,7 @@ "outputs": [], "source": [ "ag3.plot_xpehh_gwss(\n", - " contig=\"X\",\n", + " region=\"X\",\n", " window_size=100,\n", " analysis=\"gamb_colu\",\n", " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", @@ -98,7 +98,7 @@ "outputs": [], "source": [ "ag3.plot_xpehh_gwss(\n", - " contig=\"2RL\",\n", + " region=\"2RL\",\n", " window_size=100,\n", " analysis=\"gamb_colu\",\n", " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", @@ -116,7 +116,7 @@ "outputs": [], "source": [ "ag3.plot_xpehh_gwss(\n", - " contig=\"X\",\n", + " region=\"X\",\n", " window_size=1_000,\n", " analysis=\"gamb_colu\",\n", " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", @@ -145,7 +145,7 @@ "outputs": [], "source": [ "af1.plot_xpehh_gwss(\n", - " contig=\"X\",\n", + " region=\"X\",\n", " window_size=2_000,\n", " cohort1_query=\"cohort_admin1_year == 'MZ-L_fune_2018'\",\n", " cohort2_query=\"cohort_admin1_year == 'GA-2_fune_2017'\",\n", @@ -162,7 +162,7 @@ "outputs": [], "source": [ "af1.plot_xpehh_gwss(\n", - " contig=\"2RL\",\n", + " region=\"2RL\",\n", " window_size=200,\n", " cohort1_query=\"cohort_admin1_year == 'MZ-L_fune_2018'\",\n", " cohort2_query=\"cohort_admin1_year == 'GA-2_fune_2017'\",\n", @@ -182,7 +182,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "mgen_data_py3.11", "language": "python", "name": "python3" }, @@ -196,7 +196,7 @@ "name": "python", "nbconvert_exporter": "python", 
"pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.5" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/tests/integration/test_ag3.py b/tests/integration/test_ag3.py index 40bf467b2..ae3681fe1 100644 --- a/tests/integration/test_ag3.py +++ b/tests/integration/test_ag3.py @@ -138,7 +138,7 @@ def test_xpehh_gwss(): window_size = 1000 x, xpehh = ag3.xpehh_gwss( - contig=contig, + region=contig, analysis=analysis, cohort1_query=cohort1_query, cohort2_query=cohort2_query, From d7a76987a8bbc54a26962436fbad1920b3b9215f Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:07:16 +0000 Subject: [PATCH 09/21] WIP: use random_region_str() for random region in GWSS function tests --- tests/anoph/test_fst.py | 2 +- tests/anoph/test_g123.py | 12 ++++++------ tests/anoph/test_h12.py | 8 ++++---- tests/anoph/test_h1x.py | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/anoph/test_fst.py b/tests/anoph/test_fst.py index f205a838b..c52c832b6 100644 --- a/tests/anoph/test_fst.py +++ b/tests/anoph/test_fst.py @@ -84,7 +84,7 @@ def test_fst_gwss(fixture, api: AnophelesFstAnalysis): cohort1_query = f"country == {countries[0]!r}" cohort2_query = f"country == {countries[1]!r}" fst_params = dict( - region=random.choice(api.contigs), + region=fixture.random_region_str(), sample_sets=all_sample_sets, cohort1_query=cohort1_query, cohort2_query=cohort2_query, diff --git a/tests/anoph/test_g123.py b/tests/anoph/test_g123.py index 4bd5cb4ab..4dc5e4542 100644 --- a/tests/anoph/test_g123.py +++ b/tests/anoph/test_g123.py @@ -101,7 +101,7 @@ def test_g123_gwss_with_default_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. 
all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=random.choice(api.contigs), + region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), min_cohort_size=10, @@ -116,7 +116,7 @@ def test_g123_gwss_with_phased_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=random.choice(api.contigs), + region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), sites=random.choice(api.phasing_analysis_ids), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), @@ -132,7 +132,7 @@ def test_g123_gwss_with_segregating_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=random.choice(api.contigs), + region=fixture.random_region_str(), sites="segregating", site_mask=random.choice(api.site_mask_ids), sample_sets=[random.choice(all_sample_sets)], @@ -149,7 +149,7 @@ def test_g123_gwss_with_all_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=random.choice(api.contigs), + region=fixture.random_region_str(), sites="all", site_mask=None, sample_sets=[random.choice(all_sample_sets)], @@ -166,7 +166,7 @@ def test_g123_gwss_with_bad_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. 
all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=random.choice(api.contigs), + region=fixture.random_region_str(), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), min_cohort_size=10, @@ -185,7 +185,7 @@ def test_g123_calibration(fixture, api: AnophelesG123Analysis): window_sizes = np.random.randint(100, 500, size=random.randint(2, 5)).tolist() window_sizes = sorted([int(x) for x in window_sizes]) g123_params = dict( - region=random.choice(api.contigs), + region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), sites=random.choice(api.phasing_analysis_ids), sample_sets=[random.choice(all_sample_sets)], min_cohort_size=10, diff --git a/tests/anoph/test_h12.py b/tests/anoph/test_h12.py index 4e62def5c..06fb52051 100644 --- a/tests/anoph/test_h12.py +++ b/tests/anoph/test_h12.py @@ -105,7 +105,7 @@ def test_h12_calibration(fixture, api: AnophelesH12Analysis): window_sizes = np.random.randint(100, 500, size=random.randint(2, 5)).tolist() window_sizes = sorted(set([int(x) for x in window_sizes])) h12_params = dict( - region=random.choice(api.contigs), + region=fixture.random_region_str(), sample_sets=[random.choice(all_sample_sets)], window_sizes=window_sizes, min_cohort_size=5, @@ -168,7 +168,7 @@ def test_h12_gwss_with_default_analysis(fixture, api: AnophelesH12Analysis): # Set up test parameters. 
all_sample_sets = api.sample_sets()["sample_set"].to_list() h12_params = dict( - region=random.choice(api.contigs), + region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), min_cohort_size=5, @@ -236,7 +236,7 @@ def test_h12_gwss_multi_with_default_analysis(fixture, api: AnophelesH12Analysis cohort1_query = f"country == '{country1}'" cohort2_query = f"country == '{country2}'" h12_params = dict( - region=random.choice(api.contigs), + region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), sample_sets=all_sample_sets, window_size=random.randint(100, 500), min_cohort_size=1, @@ -256,7 +256,7 @@ def test_h12_gwss_multi_with_window_size_dict(fixture, api: AnophelesH12Analysis cohort1_query = f"country == '{country1}'" cohort2_query = f"country == '{country2}'" h12_params = dict( - region=random.choice(api.contigs), + region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), sample_sets=all_sample_sets, window_size={ "cohort1": random.randint(100, 500), diff --git a/tests/anoph/test_h1x.py b/tests/anoph/test_h1x.py index 2a21fc855..ed1e21f94 100644 --- a/tests/anoph/test_h1x.py +++ b/tests/anoph/test_h1x.py @@ -143,7 +143,7 @@ def test_h1x_gwss_with_default_analysis(fixture, api: AnophelesH1XAnalysis): cohort1_query = f"country == '{country1}'" cohort2_query = f"country == '{country2}'" h1x_params = dict( - region=random.choice(api.contigs), + region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), sample_sets=all_sample_sets, window_size=random.randint(100, 500), min_cohort_size=1, From 8dfa5f8cb579a330cd2d34afd337b99d16b242fd Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Fri, 17 Jan 2025 15:59:55 +0000 Subject: [PATCH 10/21] WIP: use random contig for GWSS function tests --- tests/anoph/test_fst.py | 2 +- tests/anoph/test_g123.py | 6 +++--- 
tests/anoph/test_h12.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/anoph/test_fst.py b/tests/anoph/test_fst.py index c52c832b6..896af3fd2 100644 --- a/tests/anoph/test_fst.py +++ b/tests/anoph/test_fst.py @@ -84,7 +84,7 @@ def test_fst_gwss(fixture, api: AnophelesFstAnalysis): cohort1_query = f"country == {countries[0]!r}" cohort2_query = f"country == {countries[1]!r}" fst_params = dict( - region=fixture.random_region_str(), + region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), sample_sets=all_sample_sets, cohort1_query=cohort1_query, cohort2_query=cohort2_query, diff --git a/tests/anoph/test_g123.py b/tests/anoph/test_g123.py index 4dc5e4542..76a544013 100644 --- a/tests/anoph/test_g123.py +++ b/tests/anoph/test_g123.py @@ -132,7 +132,7 @@ def test_g123_gwss_with_segregating_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=fixture.random_region_str(), + region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), sites="segregating", site_mask=random.choice(api.site_mask_ids), sample_sets=[random.choice(all_sample_sets)], @@ -149,7 +149,7 @@ def test_g123_gwss_with_all_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=fixture.random_region_str(), + region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), sites="all", site_mask=None, sample_sets=[random.choice(all_sample_sets)], @@ -166,7 +166,7 @@ def test_g123_gwss_with_bad_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. 
all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=fixture.random_region_str(), + region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), min_cohort_size=10, diff --git a/tests/anoph/test_h12.py b/tests/anoph/test_h12.py index 06fb52051..dc2ba7045 100644 --- a/tests/anoph/test_h12.py +++ b/tests/anoph/test_h12.py @@ -105,7 +105,7 @@ def test_h12_calibration(fixture, api: AnophelesH12Analysis): window_sizes = np.random.randint(100, 500, size=random.randint(2, 5)).tolist() window_sizes = sorted(set([int(x) for x in window_sizes])) h12_params = dict( - region=fixture.random_region_str(), + region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), sample_sets=[random.choice(all_sample_sets)], window_sizes=window_sizes, min_cohort_size=5, From e668189f923ce532b046e6844d3f032efae661f7 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Fri, 17 Jan 2025 17:04:24 +0000 Subject: [PATCH 11/21] Use random_region_str() instead of random contig for test_fst_gwss() --- tests/anoph/test_fst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/anoph/test_fst.py b/tests/anoph/test_fst.py index 896af3fd2..c52c832b6 100644 --- a/tests/anoph/test_fst.py +++ b/tests/anoph/test_fst.py @@ -84,7 +84,7 @@ def test_fst_gwss(fixture, api: AnophelesFstAnalysis): cohort1_query = f"country == {countries[0]!r}" cohort2_query = f"country == {countries[1]!r}" fst_params = dict( - region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), + region=fixture.random_region_str(), sample_sets=all_sample_sets, cohort1_query=cohort1_query, cohort2_query=cohort2_query, From b1e0e24fd362d8a2b3d56116845d683730839071 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Fri, 17 Jan 2025 17:48:10 +0000 Subject: [PATCH 12/21] Use 
random_region_str() instead of random contig for test_g123_gwss_with_default_sites() --- tests/anoph/test_g123.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/anoph/test_g123.py b/tests/anoph/test_g123.py index 76a544013..597c05c57 100644 --- a/tests/anoph/test_g123.py +++ b/tests/anoph/test_g123.py @@ -101,7 +101,7 @@ def test_g123_gwss_with_default_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), + region=fixture.random_region_str(), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), min_cohort_size=10, From 97c63f3b2382eb20732519f998d0e1c20ea1cb6b Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Thu, 23 Jan 2025 12:40:15 +0000 Subject: [PATCH 13/21] Replace random contig with random region of fixed size in gwss function tests --- tests/anoph/test_g123.py | 12 ++++++------ tests/anoph/test_h12.py | 8 ++++---- tests/anoph/test_h1x.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/anoph/test_g123.py b/tests/anoph/test_g123.py index 597c05c57..356aa03f0 100644 --- a/tests/anoph/test_g123.py +++ b/tests/anoph/test_g123.py @@ -101,7 +101,7 @@ def test_g123_gwss_with_default_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=fixture.random_region_str(), + region=fixture.random_region_str(region_size=5000), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), min_cohort_size=10, @@ -116,7 +116,7 @@ def test_g123_gwss_with_phased_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters.
all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), + region=fixture.random_region_str(region_size=10_000), sites=random.choice(api.phasing_analysis_ids), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), @@ -132,7 +132,7 @@ def test_g123_gwss_with_segregating_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), + region=fixture.random_region_str(region_size=5000), sites="segregating", site_mask=random.choice(api.site_mask_ids), sample_sets=[random.choice(all_sample_sets)], @@ -149,7 +149,7 @@ def test_g123_gwss_with_all_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), + region=fixture.random_region_str(region_size=5000), sites="all", site_mask=None, sample_sets=[random.choice(all_sample_sets)], @@ -166,7 +166,7 @@ def test_g123_gwss_with_bad_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. 
all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), + region=fixture.random_region_str(region_size=5000), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), min_cohort_size=10, @@ -185,7 +185,7 @@ def test_g123_calibration(fixture, api: AnophelesG123Analysis): window_sizes = np.random.randint(100, 500, size=random.randint(2, 5)).tolist() window_sizes = sorted([int(x) for x in window_sizes]) g123_params = dict( - region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), + region=fixture.random_region_str(region_size=10_000), sites=random.choice(api.phasing_analysis_ids), sample_sets=[random.choice(all_sample_sets)], min_cohort_size=10, diff --git a/tests/anoph/test_h12.py b/tests/anoph/test_h12.py index dc2ba7045..8a86f7105 100644 --- a/tests/anoph/test_h12.py +++ b/tests/anoph/test_h12.py @@ -105,7 +105,7 @@ def test_h12_calibration(fixture, api: AnophelesH12Analysis): window_sizes = np.random.randint(100, 500, size=random.randint(2, 5)).tolist() window_sizes = sorted(set([int(x) for x in window_sizes])) h12_params = dict( - region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), + region=fixture.random_region_str(region_size=5000), sample_sets=[random.choice(all_sample_sets)], window_sizes=window_sizes, min_cohort_size=5, @@ -168,7 +168,7 @@ def test_h12_gwss_with_default_analysis(fixture, api: AnophelesH12Analysis): # Set up test parameters. 
all_sample_sets = api.sample_sets()["sample_set"].to_list() h12_params = dict( - region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), + region=fixture.random_region_str(region_size=5000), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), min_cohort_size=5, @@ -236,7 +236,7 @@ def test_h12_gwss_multi_with_default_analysis(fixture, api: AnophelesH12Analysis cohort1_query = f"country == '{country1}'" cohort2_query = f"country == '{country2}'" h12_params = dict( - region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), + region=fixture.random_region_str(region_size=5000), sample_sets=all_sample_sets, window_size=random.randint(100, 500), min_cohort_size=1, @@ -256,7 +256,7 @@ def test_h12_gwss_multi_with_window_size_dict(fixture, api: AnophelesH12Analysis cohort1_query = f"country == '{country1}'" cohort2_query = f"country == '{country2}'" h12_params = dict( - region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), + region=fixture.random_region_str(region_size=5000), sample_sets=all_sample_sets, window_size={ "cohort1": random.randint(100, 500), diff --git a/tests/anoph/test_h1x.py b/tests/anoph/test_h1x.py index ed1e21f94..1b560778d 100644 --- a/tests/anoph/test_h1x.py +++ b/tests/anoph/test_h1x.py @@ -143,7 +143,7 @@ def test_h1x_gwss_with_default_analysis(fixture, api: AnophelesH1XAnalysis): cohort1_query = f"country == '{country1}'" cohort2_query = f"country == '{country2}'" h1x_params = dict( - region=random.choice(api.contigs), # FIXME: region=fixture.random_region_str(), + region=fixture.random_region_str(region_size=5000), sample_sets=all_sample_sets, window_size=random.randint(100, 500), min_cohort_size=1, From aa41bbdef4a7f9dcaecb2ad7f487ab4f37f86cde Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Mon, 3 Feb 2025 11:55:22 +0000 Subject: [PATCH 14/21] Add region_size for random_region_str in test_fst_gwss() --- 
tests/anoph/test_fst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/anoph/test_fst.py b/tests/anoph/test_fst.py index c52c832b6..a7750105f 100644 --- a/tests/anoph/test_fst.py +++ b/tests/anoph/test_fst.py @@ -84,7 +84,7 @@ def test_fst_gwss(fixture, api: AnophelesFstAnalysis): cohort1_query = f"country == {countries[0]!r}" cohort2_query = f"country == {countries[1]!r}" fst_params = dict( - region=fixture.random_region_str(), + region=fixture.random_region_str(region_size=5000), sample_sets=all_sample_sets, cohort1_query=cohort1_query, cohort2_query=cohort2_query, From 2221fd7229e03cdbc2a78513430edfd6b7c0e555 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Mon, 3 Feb 2025 12:12:01 +0000 Subject: [PATCH 15/21] Increase random region size to 10_000 for test_g123_gwss_with_default_sites() and test_h12_gwss_multi_with_default_analysis() --- tests/anoph/test_g123.py | 2 +- tests/anoph/test_h12.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/anoph/test_g123.py b/tests/anoph/test_g123.py index 356aa03f0..4d18947a1 100644 --- a/tests/anoph/test_g123.py +++ b/tests/anoph/test_g123.py @@ -101,7 +101,7 @@ def test_g123_gwss_with_default_sites(fixture, api: AnophelesG123Analysis): # Set up test parameters. 
all_sample_sets = api.sample_sets()["sample_set"].to_list() g123_params = dict( - region=fixture.random_region_str(region_size=5000), + region=fixture.random_region_str(region_size=10_000), sample_sets=[random.choice(all_sample_sets)], window_size=random.randint(100, 500), min_cohort_size=10, diff --git a/tests/anoph/test_h12.py b/tests/anoph/test_h12.py index 8a86f7105..d417d3881 100644 --- a/tests/anoph/test_h12.py +++ b/tests/anoph/test_h12.py @@ -236,7 +236,7 @@ def test_h12_gwss_multi_with_default_analysis(fixture, api: AnophelesH12Analysis cohort1_query = f"country == '{country1}'" cohort2_query = f"country == '{country2}'" h12_params = dict( - region=fixture.random_region_str(region_size=5000), + region=fixture.random_region_str(region_size=10_000), sample_sets=all_sample_sets, window_size=random.randint(100, 500), min_cohort_size=1, From 69eec31e621fc182157251ff34c86a130c9eae7f Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Mon, 3 Feb 2025 15:54:54 +0000 Subject: [PATCH 16/21] Support deprecated contig param in fst_gwss() --- malariagen_data/anoph/fst.py | 38 ++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py index 11297ddf0..40073dbcc 100644 --- a/malariagen_data/anoph/fst.py +++ b/malariagen_data/anoph/fst.py @@ -1,4 +1,5 @@ from typing import Tuple, Optional +import warnings import numpy as np import pandas as pd @@ -105,10 +106,10 @@ def _fst_gwss( ) def fst_gwss( self, - region: base_params.region, - window_size: fst_params.window_size, - cohort1_query: base_params.sample_query, - cohort2_query: base_params.sample_query, + region: Optional[base_params.region] = None, + window_size: Optional[fst_params.window_size] = None, + cohort1_query: Optional[base_params.sample_query] = None, + cohort2_query: Optional[base_params.sample_query] = None, sample_query_options: Optional[base_params.sample_query_options] = None, 
sample_sets: Optional[base_params.sample_sets] = None, site_mask: Optional[base_params.site_mask] = base_params.DEFAULT, @@ -123,11 +124,40 @@ def fst_gwss( inline_array: base_params.inline_array = base_params.inline_array_default, chunks: base_params.chunks = base_params.native_chunks, clip_min: fst_params.clip_min = 0.0, + contig: Optional[base_params.region] = None, # Deprecated ) -> Tuple[np.ndarray, np.ndarray]: # Change this name if you ever change the behaviour of this function, to # invalidate any previously cached data. name = "fst_gwss_v3" + # Specify which quasi-positional args are required. + required_args = ("window_size", "cohort1_query", "cohort2_query") + + # Raise an error for any missing required args. + missing_args = [] + for required_arg in required_args: + if locals()[required_arg] is None: + missing_args.append(required_arg) + if missing_args: + raise ValueError(f"Missing required arguments: '{missing_args}'") + + if contig is not None: + # Get the current warning filters. + original_warning_filters = warnings.filters[:] + + # Trigger the warning. + warnings.simplefilter("default", DeprecationWarning) + warnings.warn( + "The 'contig' parameter has been deprecated. Please use 'region' instead.", + DeprecationWarning, + ) + + # Restore the original warning filters. + warnings.filters = original_warning_filters + + # If contig and region are both given, then prefer region. 
+ region = contig if region is None else region + params = dict( region=region, window_size=window_size, From eb463e9d11dfa6a5027d457cdf5ee2a23648ad39 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Mon, 3 Feb 2025 17:00:22 +0000 Subject: [PATCH 17/21] Raise ValueError for missing required alternative args in fst_gwss() --- malariagen_data/anoph/fst.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py index 40073dbcc..20ade716c 100644 --- a/malariagen_data/anoph/fst.py +++ b/malariagen_data/anoph/fst.py @@ -136,10 +136,27 @@ def fst_gwss( # Raise an error for any missing required args. missing_args = [] for required_arg in required_args: - if locals()[required_arg] is None: + if locals().get(required_arg) is None: missing_args.append(required_arg) if missing_args: - raise ValueError(f"Missing required arguments: '{missing_args}'") + raise ValueError(f"Missing required arguments: {missing_args}") + + # Specify which sets of alternative args are required. + required_alternative_arg_sets = (("contig", "region"),) + + # Raise an error for any missing required alternative args. + missing_alt_args = [] + for args_set in required_alternative_arg_sets: + # Check if all alternative arguments are missing + args_set_values = [] + for arg in args_set: + args_set_values.append(locals().get(arg)) + if all(args_set_values): + missing_alt_args.append(args_set) + if missing_alt_args: + raise ValueError( + f"Missing required alternative arguments: {missing_alt_args}" + ) if contig is not None: # Get the current warning filters. 
From 88a87cba3b6ab21348ca63912cec176213f6049c Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Mon, 3 Feb 2025 17:15:12 +0000 Subject: [PATCH 18/21] Fix logic bug in fst_gwss() re missing alt args --- malariagen_data/anoph/fst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py index 20ade716c..2bddf7a96 100644 --- a/malariagen_data/anoph/fst.py +++ b/malariagen_data/anoph/fst.py @@ -151,7 +151,7 @@ def fst_gwss( args_set_values = [] for arg in args_set: args_set_values.append(locals().get(arg)) - if all(args_set_values): + if not any(args_set_values): missing_alt_args.append(args_set) if missing_alt_args: raise ValueError( From 8a0aaba723576fa980da347d91602f362e9abcd6 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Tue, 4 Feb 2025 11:31:59 +0000 Subject: [PATCH 19/21] Copy locals() in fst_gwss() --- malariagen_data/anoph/fst.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py index 2bddf7a96..8baaee655 100644 --- a/malariagen_data/anoph/fst.py +++ b/malariagen_data/anoph/fst.py @@ -130,13 +130,16 @@ def fst_gwss( # invalidate any previously cached data. name = "fst_gwss_v3" + # Get a copy of the local variables, which will include all provided function parameters. + local_vars = locals().copy() + # Specify which quasi-positional args are required. required_args = ("window_size", "cohort1_query", "cohort2_query") # Raise an error for any missing required args. 
missing_args = [] for required_arg in required_args: - if locals().get(required_arg) is None: + if local_vars.get(required_arg) is None: missing_args.append(required_arg) if missing_args: raise ValueError(f"Missing required arguments: {missing_args}") @@ -150,7 +153,7 @@ def fst_gwss( # Check if all alternative arguments are missing args_set_values = [] for arg in args_set: - args_set_values.append(locals().get(arg)) + args_set_values.append(local_vars.get(arg)) if not any(args_set_values): missing_alt_args.append(args_set) if missing_alt_args: From 51249f1eb75f55f52df16ab00133ca93c32bd4f5 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Thu, 3 Jul 2025 10:48:39 +0100 Subject: [PATCH 20/21] Add _resolve_region_with_deprec_contig_param. Use in fst_gwss. --- malariagen_data/anoph/fst.py | 44 ++++++++---------------------------- malariagen_data/util.py | 27 ++++++++++++++++++++++ 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py index 25c899761..cae8a3915 100644 --- a/malariagen_data/anoph/fst.py +++ b/malariagen_data/anoph/fst.py @@ -1,5 +1,4 @@ from typing import Tuple, Optional -import warnings import numpy as np import pandas as pd @@ -12,7 +11,7 @@ from .snp_data import AnophelesSnpData from . import base_params, fst_params, gplt_params, plotly_params -from ..util import CacheMiss, check_types +from ..util import CacheMiss, check_types, _resolve_region_with_deprec_contig_param class AnophelesFstAnalysis( @@ -134,6 +133,7 @@ def fst_gwss( local_vars = locals().copy() # Specify which quasi-positional args are required. + # Note: to avoid this, we should move towards a keyword-only version of this function. required_args = ("window_size", "cohort1_query", "cohort2_query") # Raise an error for any missing required args. 
@@ -144,42 +144,16 @@ def fst_gwss( if missing_args: raise ValueError(f"Missing required arguments: {missing_args}") - # Specify which sets of alternative args are required. - required_alternative_arg_sets = (("contig", "region"),) - - # Raise an error for any missing required alternative args. - missing_alt_args = [] - for args_set in required_alternative_arg_sets: - # Check if all alternative arguments are missing - args_set_values = [] - for arg in args_set: - args_set_values.append(local_vars.get(arg)) - if not any(args_set_values): - missing_alt_args.append(args_set) - if missing_alt_args: - raise ValueError( - f"Missing required alternative arguments: {missing_alt_args}" - ) - - if contig is not None: - # Get the current warning filters. - original_warning_filters = warnings.filters[:] - - # Trigger the warning. - warnings.simplefilter("default", DeprecationWarning) - warnings.warn( - "The 'contig' parameter has been deprecated. Please use 'region' instead.", - DeprecationWarning, - ) - - # Restore the original warning filters. - warnings.filters = original_warning_filters + resolved_region = _resolve_region_with_deprec_contig_param( + region=region, contig=contig + ) - # If contig and region are both given, then prefer region. - region = contig if region is None else region + # Delete original parameters to prevent accidental use. + del region + del contig params = dict( - region=region, + region=resolved_region, window_size=window_size, cohort1_query=cohort1_query, cohort2_query=cohort2_query, diff --git a/malariagen_data/util.py b/malariagen_data/util.py index 09e8fbe91..2b0bc74fc 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -1701,3 +1701,30 @@ def _make_sample_period_year(row): return pd.Period(freq="Y", year=year) else: return pd.NaT + + +def _resolve_region_with_deprec_contig_param(*, region, contig): + """ + This helper function should be used by any public function that accepts both a `region` and a `contig` parameter. 
+ The `contig` parameter is now deprecated, so we need to determine which value the `region` should have. + This function returns the determined value for the `region` based on the given parameters. + """ + + if contig is None: + # A `contig` has not been given, so return whatever `region` is. + return region + elif region is None: + # A `contig` has been given, and a `region` has not been given. + # Raise a `DeprecationWarning` for the `contig` param. + # Note: this might not be shown due to warning filters. + warnings.warn( + "The 'contig' parameter has been deprecated. Please use 'region' instead.", + DeprecationWarning, + ) + # The given `contig` should be a valid `region`, so return the given `contig` as the region. + return contig + else: + # Both a `contig` and a `region` have been given. + raise ValueError( + "Both 'region' and 'contig' parameters were provided. Please provide a 'region' parameter only. The 'contig' parameter has been deprecated." + ) From e014a5cb3d5a3e2c5d448dc98dc9a6ccb71c1f19 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Mon, 7 Jul 2025 15:39:30 +0100 Subject: [PATCH 21/21] Re-remove _make_sample_period_... 
funcs from util.py, defined in frq_base.py --- malariagen_data/util.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/malariagen_data/util.py b/malariagen_data/util.py index e3c312353..08f0878be 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -1605,32 +1605,6 @@ def add_frequency_ci(*, ds, ci_method): ds["event_frequency_ci_upp"] = ("variants", "cohorts"), frq_ci_upp -def _make_sample_period_month(row): - year = row.year - month = row.month - if year > 0 and month > 0: - return pd.Period(freq="M", year=year, month=month) - else: - return pd.NaT - - -def _make_sample_period_quarter(row): - year = row.year - month = row.month - if year > 0 and month > 0: - return pd.Period(freq="Q", year=year, month=month) - else: - return pd.NaT - - -def _make_sample_period_year(row): - year = row.year - if year > 0: - return pd.Period(freq="Y", year=year) - else: - return pd.NaT - - def _resolve_region_with_deprec_contig_param(*, region, contig): """ This helper function should be used by any public function that accepts both a `region` and a `contig` parameter.